.source 4 s guint8
.param 2 alpha
.temp 4 t
+.temp 4 t2
.temp 2 tw
.temp 1 tb
.temp 4 a
splatbl a, tb
x4 convubw a_wide, a
x4 mullw a_wide, a_wide, alpha
-x4 shruw a_wide, a_wide, 8
+x4 div255w a_wide, a_wide
+
+# load dest pixel into t2
+loadl t2, d
+
+# t = s - d (byte-wise; NOTE(review): confirm d > s wrap-around is handled before the unsigned widen below)
+x4 subb t, t, t2
+
+# s_wide = (uint16)(t) * a_wide
x4 convubw s_wide, t
-loadl t, d
-x4 convubw d_wide, t
-x4 subw s_wide, s_wide, d_wide
x4 mullw s_wide, s_wide, a_wide
+
+# s_wide /= 255
x4 div255w s_wide, s_wide
+
+# d_wide = (uint16)(dest) + s_wide
+x4 convubw d_wide, t2
x4 addw d_wide, d_wide, s_wide
+
+# Set output alpha to 0xff and store
x4 convwb t, d_wide
orl t, t, a_alpha
storel d, t
splatbl a, tb
x4 convubw a_wide, a
x4 mullw a_wide, a_wide, alpha
-x4 shruw a_wide, a_wide, 8
+x4 div255w a_wide, a_wide
+
+loadl t2, d
+# t = s - d (as bytes)
+x4 subb t, t, t2
+
x4 convubw s_wide, t
-loadl t, d
-x4 convubw d_wide, t
-x4 subw s_wide, s_wide, d_wide
x4 mullw s_wide, s_wide, a_wide
x4 div255w s_wide, s_wide
+
+x4 convubw d_wide, t2
x4 addw d_wide, d_wide, s_wide
x4 convwb t, d_wide
orl t, t, a_alpha
.const 4 a_alpha 0x000000ff
.const 4 a_alpha_inv 0xffffff00
-# calc source alpha as alpha_s = alpha_s * alpha / 256
+# calc source alpha as alpha_s = alpha_s * alpha / 255
loadl t, s
convlw tw, t
convwb tb, tw
splatbl a, tb
x4 convubw alpha_s, a
x4 mullw alpha_s, alpha_s, alpha
-x4 shruw alpha_s, alpha_s, 8
+x4 div255w alpha_s, alpha_s
x4 convubw s_wide, t
x4 mullw s_wide, s_wide, alpha_s
.const 4 a_alpha 0xff000000
.const 4 a_alpha_inv 0x00ffffff
-# calc source alpha as alpha_s = alpha_s * alpha / 256
+# calc source alpha as alpha_s = alpha_s * alpha / 255
loadl t, s
shrul t2, t, 24
convlw tw, t2
splatbl a, tb
x4 convubw alpha_s, a
x4 mullw alpha_s, alpha_s, alpha
-x4 shruw alpha_s, alpha_s, 8
+x4 div255w alpha_s, alpha_s
x4 convubw s_wide, t
x4 mullw s_wide, s_wide, alpha_s