}
SkNf sqrt() const { return SkNf(fLo. sqrt(), fHi. sqrt()); }
- SkNf rsqrt() const { return SkNf(fLo.rsqrt(), fHi.rsqrt()); }
+
+ // Reciprocal square root at three levels: from rsqrt0 to rsqrt2, generally increasing precision, increasing cost.
+ SkNf rsqrt0() const { return SkNf(fLo.rsqrt0(), fHi.rsqrt0()); }  // each level is applied to the fLo and fHi halves separately
+ SkNf rsqrt1() const { return SkNf(fLo.rsqrt1(), fHi.rsqrt1()); }
+ SkNf rsqrt2() const { return SkNf(fLo.rsqrt2(), fHi.rsqrt2()); }
SkNf invert() const { return SkNf(fLo. invert(), fHi. invert()); }
SkNf approxInvert() const { return SkNf(fLo.approxInvert(), fHi.approxInvert()); }
static SkNf Max(const SkNf& l, const SkNf& r) { return SkNf(SkTMax(l.fVal, r.fVal)); }
SkNf sqrt() const { return SkNf(Sqrt(fVal)); }
- SkNf rsqrt() const { return SkNf((T)1 / Sqrt(fVal)); }
+ SkNf rsqrt0() const { return SkNf((T)1 / Sqrt(fVal)); }  // single-value fallback: a real divide by Sqrt(fVal), not an estimate
+ SkNf rsqrt1() const { return this->rsqrt0(); }  // nothing to refine here: same value as rsqrt0()
+ SkNf rsqrt2() const { return this->rsqrt1(); }  // likewise the same value as rsqrt0()
SkNf invert() const { return SkNf((T)1 / fVal); }
SkNf approxInvert() const { return this->invert(); }
}
}
-// TODO: can we get away with 0th approximatino of inverse-sqrt (i.e. faster than rsqrt)?
-// seems like ~10bits is more than enough for our use, since we want a byte-index
static inline Sk4f fast_sqrt(const Sk4f& R) {
- return R * R.rsqrt();
+ // sqrt(R) formed as R * (1/sqrt(R)). R * R.rsqrt0() is much faster, but it's non-monotonic, which isn't so pretty for gradients.
+ return R * R.rsqrt1();  // middle precision level of rsqrt0/rsqrt1/rsqrt2
}
static inline Sk4f sum_squares(const Sk4f& a, const Sk4f& b) {
static SkNf Min(const SkNf& l, const SkNf& r) { return vmin_f32(l.fVec, r.fVec); }
static SkNf Max(const SkNf& l, const SkNf& r) { return vmax_f32(l.fVec, r.fVec); }
- SkNf rsqrt() const {
- float32x2_t est0 = vrsqrte_f32(fVec),
- est1 = vmul_f32(vrsqrts_f32(fVec, vmul_f32(est0, est0)), est0);
- return est1;
+ SkNf rsqrt0() const { return vrsqrte_f32(fVec); }  // bare vrsqrte_f32 estimate, no refinement
+ SkNf rsqrt1() const {  // rsqrt0() followed by one refinement step
+ float32x2_t est0 = this->rsqrt0().fVec;
+ return vmul_f32(vrsqrts_f32(fVec, vmul_f32(est0, est0)), est0);  // est0 scaled by the vrsqrts_f32 correction factor
+ }
+ SkNf rsqrt2() const {  // rsqrt1() followed by a second, identical refinement step
+ float32x2_t est1 = this->rsqrt1().fVec;
+ return vmul_f32(vrsqrts_f32(fVec, vmul_f32(est1, est1)), est1);  // est1 scaled by the vrsqrts_f32 correction factor
}
SkNf sqrt() const {
#if defined(SK_CPU_ARM64)
return vsqrt_f32(fVec);
#else
- float32x2_t est1 = this->rsqrt().fVec,
- // An extra step of Newton's method to refine the estimate of 1/sqrt(this).
- est2 = vmul_f32(vrsqrts_f32(fVec, vmul_f32(est1, est1)), est1);
- return vmul_f32(fVec, est2);
+ return *this * this->rsqrt2();  // non-ARM64 path: sqrt(x) as x * (1/sqrt(x)), using the twice-refined rsqrt2() estimate
#endif
}
static SkNf Max(const SkNf& l, const SkNf& r) { return vmaxq_f64(l.fVec, r.fVec); }
SkNf sqrt() const { return vsqrtq_f64(fVec); }
- SkNf rsqrt() const {
- float64x2_t est0 = vrsqrteq_f64(fVec),
- est1 = vmulq_f64(vrsqrtsq_f64(fVec, vmulq_f64(est0, est0)), est0);
- return est1;
+
+ SkNf rsqrt0() const { return vrsqrteq_f64(fVec); }  // bare vrsqrteq_f64 estimate, no refinement
+ SkNf rsqrt1() const {  // rsqrt0() followed by one refinement step
+ float64x2_t est0 = this->rsqrt0().fVec;
+ return vmulq_f64(vrsqrtsq_f64(fVec, vmulq_f64(est0, est0)), est0);  // est0 scaled by the vrsqrtsq_f64 correction factor
+ }
+ SkNf rsqrt2() const {  // rsqrt1() followed by a second, identical refinement step
+ float64x2_t est1 = this->rsqrt1().fVec;
+ return vmulq_f64(vrsqrtsq_f64(fVec, vmulq_f64(est1, est1)), est1);  // est1 scaled by the vrsqrtsq_f64 correction factor
}
SkNf approxInvert() const {
static SkNf Min(const SkNf& l, const SkNf& r) { return vminq_f32(l.fVec, r.fVec); }
static SkNf Max(const SkNf& l, const SkNf& r) { return vmaxq_f32(l.fVec, r.fVec); }
- SkNf rsqrt() const {
- float32x4_t est0 = vrsqrteq_f32(fVec),
- est1 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est0, est0)), est0);
- return est1;
+ SkNf rsqrt0() const { return vrsqrteq_f32(fVec); }  // bare vrsqrteq_f32 estimate, no refinement
+ SkNf rsqrt1() const {  // rsqrt0() followed by one refinement step
+ float32x4_t est0 = this->rsqrt0().fVec;
+ return vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est0, est0)), est0);  // est0 scaled by the vrsqrtsq_f32 correction factor
+ }
+ SkNf rsqrt2() const {  // rsqrt1() followed by a second, identical refinement step
+ float32x4_t est1 = this->rsqrt1().fVec;
+ return vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est1, est1)), est1);  // est1 scaled by the vrsqrtsq_f32 correction factor
}
SkNf sqrt() const {
#if defined(SK_CPU_ARM64)
return vsqrtq_f32(fVec);
#else
- float32x4_t est1 = this->rsqrt().fVec,
- // An extra step of Newton's method to refine the estimate of 1/sqrt(this).
- est2 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est1, est1)), est1);
- return vmulq_f32(fVec, est2);
+ return *this * this->rsqrt2();  // non-ARM64 path: sqrt(x) as x * (1/sqrt(x)), using the twice-refined rsqrt2() estimate
#endif
}
static SkNf Max(const SkNf& l, const SkNf& r) { return _mm_max_ps(l.fVec, r.fVec); }
SkNf sqrt() const { return _mm_sqrt_ps (fVec); }
- SkNf rsqrt() const { return _mm_rsqrt_ps(fVec); }
+ SkNf rsqrt0() const { return _mm_rsqrt_ps(fVec); }  // _mm_rsqrt_ps result returned as-is
+ SkNf rsqrt1() const { return this->rsqrt0(); }  // no refinement step on this backend: identical to rsqrt0()
+ SkNf rsqrt2() const { return this->rsqrt1(); }  // identical to rsqrt0() as well
SkNf invert() const { return SkNf(1) / *this; }
SkNf approxInvert() const { return _mm_rcp_ps(fVec); }
static SkNf Max(const SkNf& l, const SkNf& r) { return _mm_max_pd(l.fVec, r.fVec); }
SkNf sqrt() const { return _mm_sqrt_pd(fVec); }
- SkNf rsqrt() const { return _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(fVec))); }
+ SkNf rsqrt0() const { return _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(fVec))); }  // round-trips through float: pd->ps, _mm_rsqrt_ps, ps->pd, so precision is that of the float estimate
+ SkNf rsqrt1() const { return this->rsqrt0(); }  // no refinement step on this backend: identical to rsqrt0()
+ SkNf rsqrt2() const { return this->rsqrt1(); }  // identical to rsqrt0() as well
SkNf invert() const { return SkNf(1) / *this; }
SkNf approxInvert() const { return _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(fVec))); }
static SkNf Max(const SkNf& l, const SkNf& r) { return _mm_max_ps(l.fVec, r.fVec); }
SkNf sqrt() const { return _mm_sqrt_ps (fVec); }
- SkNf rsqrt() const { return _mm_rsqrt_ps(fVec); }
+ SkNf rsqrt0() const { return _mm_rsqrt_ps(fVec); }  // _mm_rsqrt_ps result returned as-is
+ SkNf rsqrt1() const { return this->rsqrt0(); }  // no refinement step on this backend: identical to rsqrt0()
+ SkNf rsqrt2() const { return this->rsqrt1(); }  // identical to rsqrt0() as well
SkNf invert() const { return SkNf(1) / *this; }
SkNf approxInvert() const { return _mm_rcp_ps(fVec); }
SkNf<N,T> fours(4);
assert_eq(fours.sqrt(), 2,2,2,2);
- assert_nearly_eq(0.001, fours.rsqrt(), 0.5, 0.5, 0.5, 0.5);
+ assert_nearly_eq(0.001, fours.rsqrt0(), 0.5, 0.5, 0.5, 0.5);
+ assert_nearly_eq(0.001, fours.rsqrt1(), 0.5, 0.5, 0.5, 0.5);
+ assert_nearly_eq(0.001, fours.rsqrt2(), 0.5, 0.5, 0.5, 0.5);
assert_eq( fours. invert(), 0.25, 0.25, 0.25, 0.25);
assert_nearly_eq(0.001, fours.approxInvert(), 0.25, 0.25, 0.25, 0.25);