pixman: Branch 'master'
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Wed Oct 30 03:39:43 UTC 2024
meson.build | 22
meson_options.txt | 5
pixman/meson.build | 12
pixman/pixman-combine-float.c | 18
pixman/pixman-combine-float.h | 53 ++
pixman/pixman-implementation.c | 1
pixman/pixman-private.h | 26 +
pixman/pixman-riscv.c | 71 ++
pixman/pixman-rvv.c | 987 +++++++++++++++++++++++++++++++++++++++++
9 files changed, 1173 insertions(+), 22 deletions(-)
New commits:
commit 0e424031bda2e5ea3c7aad680a4446b5b581df31
Author: f wasil <f.wasil at samsung.com>
Date: Wed Oct 30 03:39:37 2024 +0000
RISC-V floating point operations
diff --git a/meson.build b/meson.build
index 3b56f4f..0fd6f01 100644
--- a/meson.build
+++ b/meson.build
@@ -365,6 +365,28 @@ elif use_mips_dspr2.enabled()
error('MIPS DSPr2 Support unavailable, but required')
endif
+use_rvv = get_option('rvv')
+have_rvv = false
+rvv_flags = ['-march=rv64gcv']
+if not use_rvv.disabled()
+ if host_machine.cpu_family() == 'riscv64'
+ if cc.compiles('''
+ #include <riscv_vector.h>
+ int main() { vfloat32m1_t tmp; return 0; }
+ ''',
+ args : rvv_flags,
+ name : 'RISC-V Vector Intrinsic Support')
+ have_rvv = true
+ endif
+ endif
+endif
+
+if have_rvv
+ config.set10('USE_RVV', true)
+elif use_rvv.enabled()
+ error('RISC-V Vector Support unavailable, but required')
+endif
+
use_gnu_asm = get_option('gnu-inline-asm')
if not use_gnu_asm.disabled()
if cc.compiles('''
diff --git a/meson_options.txt b/meson_options.txt
index 0d0eb19..8c68c66 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -63,6 +63,11 @@ option(
type : 'feature',
description : 'Use MIPS32 DSPr2 intrinsic optimized paths',
)
+option(
+ 'rvv',
+ type : 'feature',
+ description : 'Use RISC-V Vector extension',
+)
option(
'gnu-inline-asm',
type : 'feature',
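
The new 'rvv' option is a Meson feature option, so it presumably follows the usual tri-state behaviour at configure time: -Drvv=auto (the default) builds the vector paths only when the riscv64 compile probe above succeeds, -Drvv=enabled turns a failed probe into a hard configuration error via the error() branch in meson.build, and -Drvv=disabled skips the probe entirely. This mirrors the existing mips-dspr2 and gnu-inline-asm options.
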
diff --git a/pixman/meson.build b/pixman/meson.build
index a7ca346..ffbce17 100644
--- a/pixman/meson.build
+++ b/pixman/meson.build
@@ -57,6 +57,7 @@ simds = [
['pixman-arma64-neon-asm.S', 'pixman-arma64-neon-asm-bilinear.S']],
['mips-dspr2', have_mips_dspr2, mips_dspr2_flags,
['pixman-mips-dspr2-asm.S', 'pixman-mips-memcpy-asm.S']],
+ ['rvv', have_rvv, rvv_flags, []],
]
foreach simd : simds
@@ -74,18 +75,15 @@ pixman_files = files(
'pixman.c',
'pixman-access.c',
'pixman-access-accessors.c',
+ 'pixman-arm.c',
'pixman-bits-image.c',
'pixman-combine32.c',
'pixman-combine-float.c',
'pixman-conical-gradient.c',
- 'pixman-filter.c',
- 'pixman-x86.c',
- 'pixman-mips.c',
- 'pixman-arm.c',
- 'pixman-ppc.c',
'pixman-edge.c',
'pixman-edge-accessors.c',
'pixman-fast-path.c',
+ 'pixman-filter.c',
'pixman-glyph.c',
'pixman-general.c',
'pixman-gradient-walker.c',
@@ -93,14 +91,18 @@ pixman_files = files(
'pixman-implementation.c',
'pixman-linear-gradient.c',
'pixman-matrix.c',
+ 'pixman-mips.c',
'pixman-noop.c',
+ 'pixman-ppc.c',
'pixman-radial-gradient.c',
'pixman-region16.c',
'pixman-region32.c',
+ 'pixman-riscv.c',
'pixman-solid-fill.c',
'pixman-timer.c',
'pixman-trap.c',
'pixman-utils.c',
+ 'pixman-x86.c',
)
# Android cpu-features
diff --git a/pixman/pixman-combine-float.c b/pixman/pixman-combine-float.c
index 230164f..cf72fdb 100644
--- a/pixman/pixman-combine-float.c
+++ b/pixman/pixman-combine-float.c
@@ -34,6 +34,7 @@
#include <float.h>
#include "pixman-private.h"
+#include "pixman-combine-float.h"
/* Workaround for http://gcc.gnu.org/PR54965 */
/* GCC 4.6 has problems with force_inline, so just use normal inline instead */
@@ -148,23 +149,6 @@ combine_inner (pixman_bool_t component,
/*
* Porter/Duff operators
*/
-typedef enum
-{
- ZERO,
- ONE,
- SRC_ALPHA,
- DEST_ALPHA,
- INV_SA,
- INV_DA,
- SA_OVER_DA,
- DA_OVER_SA,
- INV_SA_OVER_DA,
- INV_DA_OVER_SA,
- ONE_MINUS_SA_OVER_DA,
- ONE_MINUS_DA_OVER_SA,
- ONE_MINUS_INV_DA_OVER_SA,
- ONE_MINUS_INV_SA_OVER_DA
-} combine_factor_t;
#define CLAMP(f) \
(((f) < 0)? 0 : (((f) > 1.0) ? 1.0 : (f)))
diff --git a/pixman/pixman-combine-float.h b/pixman/pixman-combine-float.h
new file mode 100644
index 0000000..09cbbd0
--- /dev/null
+++ b/pixman/pixman-combine-float.h
@@ -0,0 +1,53 @@
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2010, 2012 Soren Sandmann Pedersen
+ * Copyright © 2010, 2012 Red Hat, Inc.
+ * Copyright © 2024 Filip Wasil, Samsung Electronics
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Soren Sandmann Pedersen (sandmann at cs.au.dk)
+ */
+
+#ifndef __PIXMAN_COMBINE_FLOAT_H__
+#define __PIXMAN_COMBINE_FLOAT_H__
+
+/*
+ * Porter/Duff operators
+ */
+typedef enum
+{
+ ZERO,
+ ONE,
+ SRC_ALPHA,
+ DEST_ALPHA,
+ INV_SA,
+ INV_DA,
+ SA_OVER_DA,
+ DA_OVER_SA,
+ INV_SA_OVER_DA,
+ INV_DA_OVER_SA,
+ ONE_MINUS_SA_OVER_DA,
+ ONE_MINUS_DA_OVER_SA,
+ ONE_MINUS_INV_DA_OVER_SA,
+ ONE_MINUS_INV_SA_OVER_DA
+} combine_factor_t;
+
+#endif /*__PIXMAN_COMBINE_FLOAT_H__*/
\ No newline at end of file
diff --git a/pixman/pixman-implementation.c b/pixman/pixman-implementation.c
index 69fa70b..0b12239 100644
--- a/pixman/pixman-implementation.c
+++ b/pixman/pixman-implementation.c
@@ -399,6 +399,7 @@ _pixman_choose_implementation (void)
imp = _pixman_arm_get_implementations (imp);
imp = _pixman_ppc_get_implementations (imp);
imp = _pixman_mips_get_implementations (imp);
+ imp = _pixman_riscv_get_implementations (imp);
imp = _pixman_implementation_create_noop (imp);
diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
index 9754e46..f377ce3 100644
--- a/pixman/pixman-private.h
+++ b/pixman/pixman-private.h
@@ -655,6 +655,11 @@ pixman_implementation_t *
_pixman_implementation_create_vmx (pixman_implementation_t *fallback);
#endif
+#ifdef USE_RVV
+pixman_implementation_t *
+_pixman_implementation_create_rvv (pixman_implementation_t *fallback);
+#endif
+
pixman_bool_t
_pixman_implementation_disabled (const char *name);
@@ -670,6 +675,9 @@ _pixman_ppc_get_implementations (pixman_implementation_t *imp);
pixman_implementation_t *
_pixman_mips_get_implementations (pixman_implementation_t *imp);
+pixman_implementation_t *
+_pixman_riscv_get_implementations (pixman_implementation_t *imp);
+
pixman_implementation_t *
_pixman_choose_implementation (void);
@@ -816,12 +824,30 @@ get_implementation (void)
return global_implementation;
}
+/* This function is exported for the sake of the test suite and not part
+ * of the ABI.
+ */
+PIXMAN_EXPORT pixman_implementation_t *
+_pixman_internal_only_get_reference_implementation (void);
+
/* This function is exported for the sake of the test suite and not part
* of the ABI.
*/
PIXMAN_EXPORT pixman_implementation_t *
_pixman_internal_only_get_implementation (void);
+/* This function is exported for the sake of the test suite and not part
+ * of the ABI.
+ */
+PIXMAN_EXPORT pixman_fast_path_t *
+_pixman_implementation_get_reference_fast_path (void);
+
+/* This function is exported for the sake of the test suite and not part
+ * of the ABI.
+ */
+PIXMAN_EXPORT int
+_pixman_implementation_get_reference_fast_path_size ();
+
/* Memory allocation helpers */
void *
pixman_malloc_ab (unsigned int n, unsigned int b);
diff --git a/pixman/pixman-riscv.c b/pixman/pixman-riscv.c
new file mode 100644
index 0000000..1f0440f
--- /dev/null
+++ b/pixman/pixman-riscv.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright © 2024 Filip Wasil, Samsung Electronics
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. Keith Packard makes no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <pixman-config.h>
+#endif
+
+#include "pixman-private.h"
+
+#ifdef USE_RVV
+
+#if defined(__linux__)
+#include <asm/hwcap.h>
+#include <sys/auxv.h>
+#endif
+
+typedef enum
+{
+ RVV = (1 << 0),
+} riscv_cpu_features_t;
+
+static riscv_cpu_features_t
+detect_cpu_features (void)
+{
+ riscv_cpu_features_t features = 0;
+
+#if defined(__linux__)
+ if (getauxval (AT_HWCAP) & COMPAT_HWCAP_ISA_V)
+ {
+ features |= RVV;
+ }
+#else
+#pragma message( \
+ "warning: RISC-V Vector Extension runtime check not implemented for this platform. RVV will be disabled")
+#endif
+ return features;
+}
+
+#endif
+
+pixman_implementation_t *
+_pixman_riscv_get_implementations (pixman_implementation_t *imp)
+{
+#ifdef USE_RVV
+ if (!_pixman_disabled ("rvv") && (detect_cpu_features () & RVV))
+ {
+ imp = _pixman_implementation_create_rvv (imp);
+ }
+#endif
+ return imp;
+}
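
The runtime gate above pairs a Linux AT_HWCAP query with pixman's PIXMAN_DISABLE mechanism, so a binary built with USE_RVV still falls back to the generic paths on non-vector hardware, and setting PIXMAN_DISABLE=rvv in the environment should skip the new implementation even where the hardware supports it. A minimal standalone probe along the same lines, assuming a RISC-V Linux target (a sketch, not part of the commit):

    #include <asm/hwcap.h>  /* COMPAT_HWCAP_ISA_V */
    #include <sys/auxv.h>   /* getauxval, AT_HWCAP */
    #include <stdio.h>

    int
    main (void)
    {
        /* Same test as detect_cpu_features (): the kernel advertises the
         * 'V' extension through the AT_HWCAP auxiliary vector entry. */
        if (getauxval (AT_HWCAP) & COMPAT_HWCAP_ISA_V)
            printf ("RVV available\n");
        else
            printf ("RVV not available\n");
        return 0;
    }
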
diff --git a/pixman/pixman-rvv.c b/pixman/pixman-rvv.c
new file mode 100644
index 0000000..6808f50
--- /dev/null
+++ b/pixman/pixman-rvv.c
@@ -0,0 +1,987 @@
+/*
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ * 2005 Lars Knoll & Zack Rusin, Trolltech
+ * 2024 Filip Wasil, Samsung Electronics
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission. Keith Packard makes no
+ * representations about the suitability of this software for any purpose. It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <pixman-config.h>
+#endif
+
+#include "pixman-combine-float.h"
+#include "pixman-private.h"
+
+#include <riscv_vector.h>
+
+#include <float.h>
+#include <math.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/*
+ * Screen
+ *
+ * ad * as * B(d/ad, s/as)
+ * = ad * as * (d/ad + s/as - s/as * d/ad)
+ * = ad * s + as * d - s * d
+ */
+
+static force_inline vfloat32m1_t
+rvv_blend_screen (const vfloat32m1_t sa,
+ const vfloat32m1_t s,
+ const vfloat32m1_t da,
+ const vfloat32m1_t d,
+ size_t vl)
+{
+ vfloat32m1_t t0, t1, t2;
+ t0 = __riscv_vfmul_vv_f32m1 (s, da, vl);
+ t1 = __riscv_vfmul_vv_f32m1 (d, sa, vl);
+ t2 = __riscv_vfmul_vv_f32m1 (s, d, vl);
+ return __riscv_vfsub_vv_f32m1 (__riscv_vfadd_vv_f32m1 (t0, t1, vl), t2, vl);
+}
+
+/*
+ * Multiply
+ *
+ * ad * as * B(d / ad, s / as)
+ * = ad * as * d/ad * s/as
+ * = d * s
+ *
+ */
+
+static force_inline vfloat32m1_t
+rvv_blend_multiply (const vfloat32m1_t sa,
+ const vfloat32m1_t s,
+ const vfloat32m1_t da,
+ const vfloat32m1_t d,
+ size_t vl)
+{
+ return __riscv_vfmul_vv_f32m1 (s, d, vl);
+}
+
+/*
+ * Overlay
+ *
+ * ad * as * B(d/ad, s/as)
+ * = ad * as * Hardlight (s, d)
+ * = if (d / ad < 0.5)
+ * as * ad * Multiply (s/as, 2 * d/ad)
+ * else
+ * as * ad * Screen (s/as, 2 * d / ad - 1)
+ * = if (d < 0.5 * ad)
+ * as * ad * s/as * 2 * d /ad
+ * else
+ * as * ad * (s/as + 2 * d / ad - 1 - s / as * (2 * d / ad - 1))
+ * = if (2 * d < ad)
+ * 2 * s * d
+ * else
+ * ad * s + 2 * as * d - as * ad - ad * s * (2 * d / ad - 1)
+ * = if (2 * d < ad)
+ * 2 * s * d
+ * else
+ * as * ad - 2 * (ad - d) * (as - s)
+ */
+
+static force_inline vfloat32m1_t
+rvv_blend_overlay (const vfloat32m1_t sa,
+ const vfloat32m1_t s,
+ const vfloat32m1_t da,
+ const vfloat32m1_t d,
+ size_t vl)
+{
+ vfloat32m1_t t0, t1, t2, t3, t4, f0, f1, f2;
+ vbool32_t vb;
+ t0 = __riscv_vfadd_vv_f32m1 (d, d, vl);
+ t1 = __riscv_vfmul_vv_f32m1 (__riscv_vfadd_vv_f32m1 (s, s, vl), d, vl);
+ vb = __riscv_vmflt_vv_f32m1_b32 (t0, da, vl);
+ t2 = __riscv_vfmul_vv_f32m1 (sa, da, vl);
+ f2 = __riscv_vfsub_vv_f32m1 (da, d, vl);
+ t3 = __riscv_vfmul_vf_f32m1 (f2, 2.0f, vl);
+ t4 = __riscv_vfsub_vv_f32m1 (sa, s, vl);
+ f0 = __riscv_vfmul_vv_f32m1 (t3, t4, vl);
+ f1 = __riscv_vfsub_vv_f32m1 (t2, f0, vl);
+ return __riscv_vmerge_vvm_f32m1 (f1, t1, vb, vl);
+}
+
+/*
+ * Darken
+ *
+ * ad * as * B(d/ad, s/as)
+ * = ad * as * MIN(d/ad, s/as)
+ * = MIN (as * d, ad * s)
+ */
+
+static force_inline vfloat32m1_t
+rvv_blend_darken (const vfloat32m1_t sa,
+ const vfloat32m1_t s,
+ const vfloat32m1_t da,
+ const vfloat32m1_t d,
+ size_t vl)
+{
+ vfloat32m1_t ss, dd;
+ vbool32_t vb;
+ ss = __riscv_vfmul_vv_f32m1 (da, s, vl);
+ dd = __riscv_vfmul_vv_f32m1 (sa, d, vl);
+ vb = __riscv_vmfgt_vv_f32m1_b32 (ss, dd, vl);
+ return __riscv_vmerge_vvm_f32m1 (ss, dd, vb, vl);
+}
+
+/*
+ * Lighten
+ *
+ * ad * as * B(d/ad, s/as)
+ * = ad * as * MAX(d/ad, s/as)
+ * = MAX (as * d, ad * s)
+ */
+
+static force_inline vfloat32m1_t
+rvv_blend_lighten (const vfloat32m1_t sa,
+ const vfloat32m1_t s,
+ const vfloat32m1_t da,
+ const vfloat32m1_t d,
+ size_t vl)
+{
+ vfloat32m1_t ss, dd;
+ vbool32_t vb;
+ ss = __riscv_vfmul_vv_f32m1 (s, da, vl);
+ dd = __riscv_vfmul_vv_f32m1 (d, sa, vl);
+ vb = __riscv_vmfgt_vv_f32m1_b32 (ss, dd, vl);
+ return __riscv_vmerge_vvm_f32m1 (dd, ss, vb, vl);
+}
+
+/*
+ * Color dodge
+ *
+ * ad * as * B(d/ad, s/as)
+ * = if d/ad = 0
+ * ad * as * 0
+ * else if (d/ad >= (1 - s/as)
+ * ad * as * 1
+ * else
+ * ad * as * ((d/ad) / (1 - s/as))
+ * = if d = 0
+ * 0
+ * elif as * d >= ad * (as - s)
+ * ad * as
+ * else
+ * as * (as * d / (as - s))
+ *
+ */
+
+static force_inline vfloat32m1_t
+rvv_blend_color_dodge (const vfloat32m1_t sa,
+ const vfloat32m1_t s,
+ const vfloat32m1_t da,
+ const vfloat32m1_t d,
+ size_t vl)
+{
+ vfloat32m1_t t0, t1, t2, t3, t4;
+ vbool32_t is_d_zero, vb, is_t0_non_zero;
+
+ is_d_zero = __riscv_vmfeq_vf_f32m1_b32 (d, 0.0f, vl);
+
+ t0 = __riscv_vfsub_vv_f32m1 (sa, s, vl); // sa - s
+ t1 = __riscv_vfmul_vv_f32m1 (sa, d, vl); // d * sa
+ t2 = __riscv_vfmul_vv_f32m1 (sa, da, vl); // sa * da
+ t3 = __riscv_vfsub_vv_f32m1 (t2, __riscv_vfmul_vv_f32m1 (s, da, vl),
+ vl); // sa * da - s * da
+
+ is_t0_non_zero = __riscv_vmfne_vf_f32m1_b32 (t0, 0.0f, vl);
+ vb = __riscv_vmflt_vv_f32m1_b32 (t3, t1, vl);
+ t4 = __riscv_vfdiv_vv_f32m1 (__riscv_vfmul_vv_f32m1 (sa, t1, vl), t0,
+ vl); // sa * sa * d / (sa - s);
+
+ return __riscv_vfmerge_vfm_f32m1 (
+ __riscv_vmerge_vvm_f32m1 (
+ __riscv_vmerge_vvm_f32m1 (t2, t4, is_t0_non_zero, vl), t2, vb, vl),
+ 0.0f, is_d_zero, vl);
+}
+
+/*
+ * Color burn
+ *
+ * We modify the first clause "if d = 1" to "if d >= 1" since with
+ * premultiplied colors d > 1 can actually happen.
+ *
+ * ad * as * B(d/ad, s/as)
+ * = if d/ad >= 1
+ * ad * as * 1
+ * elif (1 - d/ad) >= s/as
+ * ad * as * 0
+ * else
+ * ad * as * (1 - ((1 - d/ad) / (s/as)))
+ * = if d >= ad
+ * ad * as
+ * elif as * ad - as * d >= ad * s
+ * 0
+ * else
+ * ad * as - as * as * (ad - d) / s
+ */
+
+static force_inline vfloat32m1_t
+rvv_blend_color_burn (const vfloat32m1_t sa,
+ const vfloat32m1_t s,
+ const vfloat32m1_t da,
+ const vfloat32m1_t d,
+ size_t vl)
+{
+ vfloat32m1_t t0, t1, t2, t3, t4, t5, t6, t7;
+ vbool32_t is_d_ge_da, is_s_zero, vb;
+
+ is_d_ge_da = __riscv_vmfge_vv_f32m1_b32 (d, da, vl);
+ is_s_zero = __riscv_vmfeq_vf_f32m1_b32 (s, 0.0f, vl);
+
+ t0 = __riscv_vfmul_vv_f32m1 (sa, __riscv_vfsub_vv_f32m1 (da, d, vl),
+ vl); // sa * (da - d)
+ t1 = __riscv_vfsub_vv_f32m1 (da, __riscv_vfdiv_vv_f32m1 (t0, s, vl),
+ vl); // da - sa * (da - d) / s)
+ t2 = __riscv_vfmul_vv_f32m1 (sa, da, vl); // sa * da
+ t3 = __riscv_vfmul_vv_f32m1 (sa, t1, vl); // sa * (da - sa * (da - d) / s)
+ t4 = __riscv_vfmul_vv_f32m1 (s, da, vl); // s * da
+ vb = __riscv_vmfge_vf_f32m1_b32 (__riscv_vfsub_vv_f32m1 (t0, t4, vl), 0.0f,
+ vl); // if (sa * (da - d) - s * da >= 0.0f)
+
+ t6 = __riscv_vfmerge_vfm_f32m1 (t3, 0.0f, is_s_zero, vl);
+ t5 = __riscv_vfmerge_vfm_f32m1 (t6, 0.0f, vb, vl);
+ t7 = __riscv_vmerge_vvm_f32m1 (t5, t2, is_d_ge_da, vl);
+
+ return t7;
+}
+
+/*
+ * Hard light
+ *
+ * ad * as * B(d/ad, s/as)
+ * = if (s/as <= 0.5)
+ * ad * as * Multiply (d/ad, 2 * s/as)
+ * else
+ * ad * as * Screen (d/ad, 2 * s/as - 1)
+ * = if 2 * s <= as
+ * ad * as * d/ad * 2 * s / as
+ * else
+ * ad * as * (d/ad + (2 * s/as - 1) + d/ad * (2 * s/as - 1))
+ * = if 2 * s <= as
+ * 2 * s * d
+ * else
+ * as * ad - 2 * (ad - d) * (as - s)
+ */
+
+static force_inline vfloat32m1_t
+rvv_blend_hard_light (const vfloat32m1_t sa,
+ const vfloat32m1_t s,
+ const vfloat32m1_t da,
+ const vfloat32m1_t d,
+ size_t vl)
+{
+ vfloat32m1_t t0, t1, t2, t3, t4;
+ vbool32_t vb;
+ t0 = __riscv_vfadd_vv_f32m1 (s, s, vl);
+ t1 = __riscv_vfmul_vv_f32m1 (__riscv_vfadd_vv_f32m1 (s, s, vl), d, vl);
+ vb = __riscv_vmfgt_vv_f32m1_b32 (t0, sa, vl);
+ t2 = __riscv_vfmul_vv_f32m1 (sa, da, vl);
+ t3 = __riscv_vfmul_vf_f32m1 (__riscv_vfsub_vv_f32m1 (da, d, vl), 2.0f, vl);
+ t4 = __riscv_vfsub_vv_f32m1 (sa, s, vl);
+ return __riscv_vmerge_vvm_f32m1 (
+ t1,
+ __riscv_vfsub_vv_f32m1 (t2, __riscv_vfmul_vv_f32m1 (t3, t4, vl), vl),
+ vb, vl);
+}
+
+/*
+ * Soft light
+ *
+ * ad * as * B(d/ad, s/as)
+ * = if (s/as <= 0.5)
+ * ad * as * (d/ad - (1 - 2 * s/as) * d/ad * (1 - d/ad))
+ * else if (d/ad <= 0.25)
+ * ad * as * (d/ad + (2 * s/as - 1) * ((((16 * d/ad - 12) * d/ad + 4) * d/ad) - d/ad))
+ * else
+ * ad * as * (d/ad + (2 * s/as - 1) * sqrt (d/ad))
+ * = if (2 * s <= as)
+ * d * as - d * (ad - d) * (as - 2 * s) / ad;
+ * else if (4 * d <= ad)
+ * (2 * s - as) * d * ((16 * d / ad - 12) * d / ad + 3);
+ * else
+ * d * as + (sqrt (d * ad) - d) * (2 * s - as);
+ */
+
+static force_inline vfloat32m1_t
+rvv_blend_soft_light (const vfloat32m1_t sa,
+ const vfloat32m1_t s,
+ const vfloat32m1_t da,
+ const vfloat32m1_t d,
+ size_t vl)
+{
+ vfloat32m1_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12,
+ t13;
+ vbool32_t is_sa_lt_2s, is_da_ls_4d, is_da_non_zero;
+ is_da_non_zero = __riscv_vmfne_vf_f32m1_b32 (da, 0.0f, vl);
+ t0 = __riscv_vfadd_vv_f32m1 (s, s, vl); // 2 * s
+ is_sa_lt_2s = __riscv_vmflt_vv_f32m1_b32 (sa, t0, vl);
+ t1 = __riscv_vfmul_vv_f32m1 (sa, d, vl); // d * sa
+ t2 = __riscv_vfsub_vv_f32m1 (sa, t0, vl); // (sa - 2*s)
+ t3 = __riscv_vfmul_vv_f32m1 (d, t2, vl); // (sa - 2*s) * d
+ t7 = __riscv_vfdiv_vv_f32m1 (__riscv_vfmul_vf_f32m1 (d, 16.0f, vl), da,
+ vl); // 16 * d / da
+ t8 = __riscv_vfmul_vv_f32m1 (d, __riscv_vfsub_vf_f32m1 (t7, 12.0f, vl),
+ vl); // (16 * d / da - 12) * d
+ t9 = __riscv_vfadd_vf_f32m1 (__riscv_vfdiv_vv_f32m1 (t8, da, vl), 3.0f,
+ vl); // (16 * d / da - 12) * d / da + 3)
+ t4 = __riscv_vfmul_vv_f32m1 (
+ t3, t9, vl); // (sa - 2*s) * d * ((16 * d / da - 12) * d / da + 3)
+ t5 = __riscv_vfsub_vv_f32m1 (
+ t1, t4,
+ vl); // d * sa - (sa - 2*s) * d * ((16 * d / da - 12) * d / da + 3)
+ t6 = __riscv_vfadd_vv_f32m1 (__riscv_vfadd_vv_f32m1 (d, d, vl),
+ __riscv_vfadd_vv_f32m1 (d, d, vl), vl);
+ is_da_ls_4d = __riscv_vmflt_vv_f32m1_b32 (da, t6, vl);
+ t10 = __riscv_vfsub_vv_f32m1 (
+ __riscv_vfsqrt_v_f32m1 (__riscv_vfmul_vv_f32m1 (d, da, vl), vl), d,
+ vl); // sqrtf (d * da) - d
+ t11 = __riscv_vfmul_vv_f32m1 (t2, t10,
+ vl); // (sqrtf (d * da) - d) * (sa - 2 * s)
+ t12 = __riscv_vfsub_vv_f32m1 (
+ t1, t11, vl); // d * sa - (sqrtf (d * da) - d) * (sa - 2 * s)
+ // d * sa - d * (da - d) * (sa - 2 * s) / da
+ t13 = __riscv_vfsub_vv_f32m1 (
+ t1,
+ __riscv_vfdiv_vv_f32m1 (
+ __riscv_vfmul_vv_f32m1 (__riscv_vfmul_vv_f32m1 (d, t2, vl),
+ __riscv_vfsub_vv_f32m1 (da, d, vl), vl),
+ da, vl),
+ vl);
+ return __riscv_vmerge_vvm_f32m1 (
+ t1, // if (!FLOAT_IS_ZERO (da))
+ __riscv_vmerge_vvm_f32m1 (
+ t13, // if (4 * d > da)
+ __riscv_vmerge_vvm_f32m1 (t5, t12, is_da_ls_4d, vl), is_sa_lt_2s,
+ vl),
+ is_da_non_zero, vl);
+}
+
+/*
+ * Difference
+ *
+ * ad * as * B(s/as, d/ad)
+ * = ad * as * abs (s/as - d/ad)
+ * = if (s/as <= d/ad)
+ * ad * as * (d/ad - s/as)
+ * else
+ * ad * as * (s/as - d/ad)
+ * = if (ad * s <= as * d)
+ * as * d - ad * s
+ * else
+ * ad * s - as * d
+ */
+
+static force_inline vfloat32m1_t
+rvv_blend_difference (const vfloat32m1_t sa,
+ const vfloat32m1_t s,
+ const vfloat32m1_t da,
+ const vfloat32m1_t d,
+ size_t vl)
+{
+ vfloat32m1_t dsa, sda;
+ vbool32_t vb;
+ dsa = __riscv_vfmul_vv_f32m1 (d, sa, vl);
+ sda = __riscv_vfmul_vv_f32m1 (s, da, vl);
+ vb = __riscv_vmflt_vv_f32m1_b32 (sda, dsa, vl);
+ return __riscv_vmerge_vvm_f32m1 (__riscv_vfsub_vv_f32m1 (sda, dsa, vl),
+ __riscv_vfsub_vv_f32m1 (dsa, sda, vl), vb,
+ vl);
+}
+
+/*
+ * Exclusion
+ *
+ * ad * as * B(s/as, d/ad)
+ * = ad * as * (d/ad + s/as - 2 * d/ad * s/as)
+ * = as * d + ad * s - 2 * s * d
+ */
+
+static force_inline vfloat32m1_t
+rvv_blend_exclusion (const vfloat32m1_t sa,
+ const vfloat32m1_t s,
+ const vfloat32m1_t da,
+ const vfloat32m1_t d,
+ size_t vl)
+{
+ vfloat32m1_t t0, t1;
+ t0 = __riscv_vfmul_vv_f32m1 (__riscv_vfadd_vv_f32m1 (d, d, vl), s, vl);
+ t1 = __riscv_vfadd_vv_f32m1 (__riscv_vfmul_vv_f32m1 (s, da, vl),
+ __riscv_vfmul_vv_f32m1 (d, sa, vl), vl);
+ return __riscv_vfsub_vv_f32m1 (t1, t0, vl);
+}
+
+typedef vfloat32m1_t (*rvv_combine_channel_float_t) (const vfloat32m1_t sa,
+ const vfloat32m1_t s,
+ const vfloat32m1_t da,
+ const vfloat32m1_t d,
+ size_t vl);
+
+static force_inline void
+rvv_combine_inner (pixman_bool_t component,
+ float *dest,
+ const float *src,
+ const float *mask,
+ int n_pixels,
+ rvv_combine_channel_float_t combine_a,
+ rvv_combine_channel_float_t combine_c)
+{
+ float *__restrict__ pd = dest;
+ const float *__restrict__ ps = src;
+ const float *__restrict__ pm = mask;
+
+ const int component_count = 4;
+ int vn = component_count * n_pixels;
+ int vl = 0;
+ int vl_step = 0;
+
+ const ptrdiff_t stride = component_count * sizeof (float);
+
+ vfloat32m1x4_t sa_sr_sg_sb, da_dr_dg_db, ma_mr_mg_mb;
+ vfloat32m1_t da2, dr2, dg2, db2, ma2, mr2, mg2, mb2, sr2, sg2, sb2, sa2;
+
+ if (n_pixels == 0)
+ {
+ return;
+ }
+
+ if (!mask)
+ {
+ for (; vn > 0; vn -= vl_step, pd += vl_step, ps += vl_step)
+ {
+ vl = __riscv_vsetvl_e32m1 (vn / component_count);
+ sa_sr_sg_sb = __riscv_vlseg4e32_v_f32m1x4 (ps, vl);
+ da_dr_dg_db = __riscv_vlseg4e32_v_f32m1x4 (pd, vl);
+
+ da2 = combine_a (__riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 0),
+ __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 0),
+ __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0),
+ __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0), vl);
+
+ dr2 = combine_c (__riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 0),
+ __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 1),
+ __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0),
+ __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 1), vl);
+
+ dg2 = combine_c (__riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 0),
+ __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 2),
+ __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0),
+ __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 2), vl);
+
+ db2 = combine_c (__riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 0),
+ __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 3),
+ __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0),
+ __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 3), vl);
+
+ __riscv_vsseg4e32_v_f32m1x4 (
+ pd, __riscv_vcreate_v_f32m1x4 (da2, dr2, dg2, db2), vl);
+
+ vl_step = vl * component_count;
+ }
+ }
+ else
+ {
+ if (component)
+ {
+ for (; vn > 0;
+ vn -= vl_step, pd += vl_step, ps += vl_step, pm += vl_step)
+ {
+ vl = __riscv_vsetvl_e32m1 (vn / component_count);
+
+ sa_sr_sg_sb = __riscv_vlseg4e32_v_f32m1x4 (ps, vl);
+ da_dr_dg_db = __riscv_vlseg4e32_v_f32m1x4 (pd, vl);
+ ma_mr_mg_mb = __riscv_vlseg4e32_v_f32m1x4 (pm, vl);
+
+ sr2 = __riscv_vfmul_vv_f32m1 (
+ __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 1),
+ __riscv_vget_v_f32m1x4_f32m1 (ma_mr_mg_mb, 1), vl);
+
+ sg2 = __riscv_vfmul_vv_f32m1 (
+ __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 2),
+ __riscv_vget_v_f32m1x4_f32m1 (ma_mr_mg_mb, 2), vl);
+
+ sb2 = __riscv_vfmul_vv_f32m1 (
+ __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 3),
+ __riscv_vget_v_f32m1x4_f32m1 (ma_mr_mg_mb, 3), vl);
+
+ ma2 = __riscv_vfmul_vv_f32m1 (
+ __riscv_vget_v_f32m1x4_f32m1 (ma_mr_mg_mb, 0),
+ __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 0), vl);
+
+ mr2 = __riscv_vfmul_vv_f32m1 (
+ __riscv_vget_v_f32m1x4_f32m1 (ma_mr_mg_mb, 1),
+ __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 0), vl);
+
+ mg2 = __riscv_vfmul_vv_f32m1 (
+ __riscv_vget_v_f32m1x4_f32m1 (ma_mr_mg_mb, 2),
+ __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 0), vl);
+
+ mb2 = __riscv_vfmul_vv_f32m1 (
+ __riscv_vget_v_f32m1x4_f32m1 (ma_mr_mg_mb, 3),
+ __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 0), vl);
+
+ da2 = combine_a (
+ ma2, ma2, __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0),
+ __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0), vl);
+
+ dr2 = combine_c (
+ mr2, sr2, __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0),
+ __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 1), vl);
+
+ dg2 = combine_c (
+ mg2, sg2, __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0),
+ __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 2), vl);
+
+ db2 = combine_c (
+ mb2, sb2, __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0),
+ __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 3), vl);
+
+ __riscv_vsseg4e32_v_f32m1x4 (
+ pd, __riscv_vcreate_v_f32m1x4 (da2, dr2, dg2, db2), vl);
+
+ vl_step = vl * component_count;
+ }
+ }
+ else
+ {
+ for (; vn > 0;
+ vn -= vl_step, pd += vl_step, ps += vl_step, pm += vl_step)
+ {
+ vl = __riscv_vsetvl_e32m1 (vn / component_count);
+
+ sa_sr_sg_sb = __riscv_vlseg4e32_v_f32m1x4 (ps, vl);
+ da_dr_dg_db = __riscv_vlseg4e32_v_f32m1x4 (pd, vl);
+ ma2 = __riscv_vlse32_v_f32m1 (pm, stride, vl);
+
+ sa2 = __riscv_vfmul_vv_f32m1 (
+ ma2, __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 0), vl);
+ sr2 = __riscv_vfmul_vv_f32m1 (
+ ma2, __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 1), vl);
+ sg2 = __riscv_vfmul_vv_f32m1 (
+ ma2, __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 2), vl);
+ sb2 = __riscv_vfmul_vv_f32m1 (
+ ma2, __riscv_vget_v_f32m1x4_f32m1 (sa_sr_sg_sb, 3), vl);
+
+ ma2 = sa2;
+
+ dr2 = combine_c (
+ ma2, sr2, __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0),
+ __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 1), vl);
+
+ dg2 = combine_c (
+ ma2, sg2, __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0),
+ __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 2), vl);
+
+ db2 = combine_c (
+ ma2, sb2, __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0),
+ __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 3), vl);
+
+ da2 = combine_a (
+ ma2, sa2, __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0),
+ __riscv_vget_v_f32m1x4_f32m1 (da_dr_dg_db, 0), vl);
+
+ __riscv_vsseg4e32_v_f32m1x4 (
+ pd, __riscv_vcreate_v_f32m1x4 (da2, dr2, dg2, db2), vl);
+
+ vl_step = vl * component_count;
+ }
+ }
+ }
+}
+
+#define RVV_MAKE_COMBINER(name, component, combine_a, combine_c) \
+ static void rvv_combine_##name##_float ( \
+ pixman_implementation_t *imp, pixman_op_t op, float *dest, \
+ const float *src, const float *mask, int n_pixels) \
+ { \
+ rvv_combine_inner (component, dest, src, mask, n_pixels, combine_a, \
+ combine_c); \
+ }
+
+#define RVV_MAKE_COMBINERS(name, combine_a, combine_c) \
+ RVV_MAKE_COMBINER (name##_ca, TRUE, combine_a, combine_c) \
+ RVV_MAKE_COMBINER (name##_u, FALSE, combine_a, combine_c)
+
+static force_inline vfloat32m1_t
+rvv_get_factor (combine_factor_t factor,
+ vfloat32m1_t sa,
+ vfloat32m1_t da,
+ size_t vl)
+{
+ vfloat32m1_t vone = __riscv_vfmv_v_f_f32m1 (1.0f, vl);
+ vfloat32m1_t vzero = __riscv_vfmv_v_f_f32m1 (0.0f, vl);
+
+ switch (factor)
+ {
+ case ZERO:
+ return vzero;
+
+ case ONE:
+ return vone;
+
+ case SRC_ALPHA:
+ return sa;
+
+ case DEST_ALPHA:
+ return da;
+
+ case INV_SA:
+ return __riscv_vfsub_vv_f32m1 (vone, sa, vl);
+
+ case INV_DA:
+ return __riscv_vfsub_vv_f32m1 (vone, da, vl);
+
+ case SA_OVER_DA:
+ return __riscv_vmerge_vvm_f32m1 (
+ vone,
+ __riscv_vfmin_vv_f32m1 (
+ vone,
+ __riscv_vfmax_vv_f32m1 (
+ vzero, __riscv_vfdiv_vv_f32m1 (sa, da, vl), vl),
+ vl),
+ __riscv_vmfne_vf_f32m1_b32 (da, 0.0f, vl), vl);
+
+ case DA_OVER_SA:
+ return __riscv_vmerge_vvm_f32m1 (
+ __riscv_vfmin_vv_f32m1 (
+ vone,
+ __riscv_vfmax_vv_f32m1 (
+ vzero, __riscv_vfdiv_vv_f32m1 (da, sa, vl), vl),
+ vl),
+ vone, __riscv_vmfeq_vf_f32m1_b32 (sa, 0.0f, vl), vl);
+
+ case INV_SA_OVER_DA:
+ {
+ vfloat32m1_t t0 = __riscv_vfdiv_vv_f32m1 (
+ __riscv_vfsub_vv_f32m1 (vone, sa, vl), da, vl);
+ return __riscv_vmerge_vvm_f32m1 (
+ vone,
+ __riscv_vfmin_vv_f32m1 (
+ vone, __riscv_vfmax_vv_f32m1 (vzero, t0, vl), vl),
+ __riscv_vmfne_vf_f32m1_b32 (da, 0.0f, vl), vl);
+ }
+
+ case INV_DA_OVER_SA:
+ {
+ vfloat32m1_t t0 = __riscv_vfdiv_vv_f32m1 (
+ __riscv_vfsub_vv_f32m1 (vone, da, vl), sa, vl);
+ return __riscv_vmerge_vvm_f32m1 (
+ vone,
+ __riscv_vfmin_vv_f32m1 (
+ vone, __riscv_vfmax_vv_f32m1 (vzero, t0, vl), vl),
+ __riscv_vmfne_vf_f32m1_b32 (sa, 0.0f, vl), vl);
+ }
+
+ case ONE_MINUS_SA_OVER_DA:
+ {
+ vfloat32m1_t t0 = __riscv_vfsub_vv_f32m1 (
+ vone, __riscv_vfdiv_vv_f32m1 (sa, da, vl), vl);
+ return __riscv_vmerge_vvm_f32m1 (
+ vzero,
+ __riscv_vfmin_vv_f32m1 (
+ vone, __riscv_vfmax_vv_f32m1 (vzero, t0, vl), vl),
+ __riscv_vmfne_vf_f32m1_b32 (da, 0.0f, vl), vl);
+ }
+
+ case ONE_MINUS_DA_OVER_SA:
+ {
+ vfloat32m1_t t0 = __riscv_vfsub_vv_f32m1 (
+ vone, __riscv_vfdiv_vv_f32m1 (da, sa, vl), vl);
+ return __riscv_vmerge_vvm_f32m1 (
+ vzero,
+ __riscv_vfmin_vv_f32m1 (
+ vone, __riscv_vfmax_vv_f32m1 (vzero, t0, vl), vl),
+ __riscv_vmfne_vf_f32m1_b32 (sa, 0.0f, vl), vl);
+ }
+
+ case ONE_MINUS_INV_DA_OVER_SA:
+ {
+ vbool32_t is_zero = __riscv_vmand_mm_b32 (
+ __riscv_vmflt_vf_f32m1_b32 (sa, FLT_MIN, vl),
+ __riscv_vmfgt_vf_f32m1_b32 (sa, -FLT_MAX, vl), vl);
+ vfloat32m1_t t0 = __riscv_vfsub_vv_f32m1 (
+ vone,
+ __riscv_vfdiv_vv_f32m1 (
+ __riscv_vfsub_vv_f32m1 (vone, da, vl), sa, vl),
+ vl);
+ return __riscv_vmerge_vvm_f32m1 (
+ __riscv_vfmin_vv_f32m1 (
+ vone, __riscv_vfmax_vv_f32m1 (vzero, t0, vl), vl),
+ vzero, is_zero, vl);
+ }
+
+ case ONE_MINUS_INV_SA_OVER_DA:
+ {
+ vfloat32m1_t t0 = __riscv_vfsub_vv_f32m1 (
+ vone,
+ __riscv_vfdiv_vv_f32m1 (
+ __riscv_vfsub_vv_f32m1 (vone, sa, vl), da, vl),
+ vl);
+ return __riscv_vmerge_vvm_f32m1 (
+ __riscv_vfmin_vv_f32m1 (
+ vone, __riscv_vfmax_vv_f32m1 (vzero, t0, vl), vl),
+ vzero, __riscv_vmfeq_vf_f32m1_b32 (da, 0.0f, vl), vl);
+ }
+ }
+
+ return __riscv_vfmv_v_f_f32m1 (-1.0f, vl);
+}
+
+#define RVV_MAKE_PD_COMBINERS(name, a, b) \
+ static vfloat32m1_t force_inline rvv_pd_combine_##name ( \
+ vfloat32m1_t sa, vfloat32m1_t s, vfloat32m1_t da, vfloat32m1_t d, \
+ size_t vl) \
+ { \
+ const vfloat32m1_t fa = rvv_get_factor (a, sa, da, vl); \
+ const vfloat32m1_t fb = rvv_get_factor (b, sa, da, vl); \
+ vfloat32m1_t t0 = __riscv_vfadd_vv_f32m1 ( \
+ __riscv_vfmul_vv_f32m1 (s, fa, vl), \
+ __riscv_vfmul_vv_f32m1 (d, fb, vl), vl); \
+ return __riscv_vfmin_vv_f32m1 (__riscv_vfmv_v_f_f32m1 (1.0f, vl), t0, \
+ vl); \
+ } \
+ \
+ RVV_MAKE_COMBINERS (name, rvv_pd_combine_##name, rvv_pd_combine_##name)
+
+RVV_MAKE_PD_COMBINERS (clear, ZERO, ZERO)
+RVV_MAKE_PD_COMBINERS (src, ONE, ZERO)
+RVV_MAKE_PD_COMBINERS (dst, ZERO, ONE)
+RVV_MAKE_PD_COMBINERS (over, ONE, INV_SA)
+RVV_MAKE_PD_COMBINERS (over_reverse, INV_DA, ONE)
+RVV_MAKE_PD_COMBINERS (in, DEST_ALPHA, ZERO)
+RVV_MAKE_PD_COMBINERS (in_reverse, ZERO, SRC_ALPHA)
+RVV_MAKE_PD_COMBINERS (out, INV_DA, ZERO)
+RVV_MAKE_PD_COMBINERS (out_reverse, ZERO, INV_SA)
+RVV_MAKE_PD_COMBINERS (atop, DEST_ALPHA, INV_SA)
+RVV_MAKE_PD_COMBINERS (atop_reverse, INV_DA, SRC_ALPHA)
+RVV_MAKE_PD_COMBINERS (xor, INV_DA, INV_SA)
+RVV_MAKE_PD_COMBINERS (add, ONE, ONE)
+
+RVV_MAKE_PD_COMBINERS (saturate, INV_DA_OVER_SA, ONE)
+
+RVV_MAKE_PD_COMBINERS (disjoint_clear, ZERO, ZERO)
+RVV_MAKE_PD_COMBINERS (disjoint_src, ONE, ZERO)
+RVV_MAKE_PD_COMBINERS (disjoint_dst, ZERO, ONE)
+RVV_MAKE_PD_COMBINERS (disjoint_over, ONE, INV_SA_OVER_DA)
+RVV_MAKE_PD_COMBINERS (disjoint_over_reverse, INV_DA_OVER_SA, ONE)
+RVV_MAKE_PD_COMBINERS (disjoint_in, ONE_MINUS_INV_DA_OVER_SA, ZERO)
+RVV_MAKE_PD_COMBINERS (disjoint_in_reverse, ZERO, ONE_MINUS_INV_SA_OVER_DA)
+RVV_MAKE_PD_COMBINERS (disjoint_out, INV_DA_OVER_SA, ZERO)
+RVV_MAKE_PD_COMBINERS (disjoint_out_reverse, ZERO, INV_SA_OVER_DA)
+RVV_MAKE_PD_COMBINERS (disjoint_atop, ONE_MINUS_INV_DA_OVER_SA, INV_SA_OVER_DA)
+RVV_MAKE_PD_COMBINERS (disjoint_atop_reverse,
+ INV_DA_OVER_SA,
+ ONE_MINUS_INV_SA_OVER_DA)
+RVV_MAKE_PD_COMBINERS (disjoint_xor, INV_DA_OVER_SA, INV_SA_OVER_DA)
+
+RVV_MAKE_PD_COMBINERS (conjoint_clear, ZERO, ZERO)
+RVV_MAKE_PD_COMBINERS (conjoint_src, ONE, ZERO)
+RVV_MAKE_PD_COMBINERS (conjoint_dst, ZERO, ONE)
+RVV_MAKE_PD_COMBINERS (conjoint_over, ONE, ONE_MINUS_SA_OVER_DA)
+RVV_MAKE_PD_COMBINERS (conjoint_over_reverse, ONE_MINUS_DA_OVER_SA, ONE)
+RVV_MAKE_PD_COMBINERS (conjoint_in, DA_OVER_SA, ZERO)
+RVV_MAKE_PD_COMBINERS (conjoint_in_reverse, ZERO, SA_OVER_DA)
+RVV_MAKE_PD_COMBINERS (conjoint_out, ONE_MINUS_DA_OVER_SA, ZERO)
+RVV_MAKE_PD_COMBINERS (conjoint_out_reverse, ZERO, ONE_MINUS_SA_OVER_DA)
+RVV_MAKE_PD_COMBINERS (conjoint_atop, DA_OVER_SA, ONE_MINUS_SA_OVER_DA)
+RVV_MAKE_PD_COMBINERS (conjoint_atop_reverse, ONE_MINUS_DA_OVER_SA, SA_OVER_DA)
+RVV_MAKE_PD_COMBINERS (conjoint_xor, ONE_MINUS_DA_OVER_SA, ONE_MINUS_SA_OVER_DA)
+
+#define RVV_MAKE_SEPARABLE_PDF_COMBINERS(name) \
+ static force_inline vfloat32m1_t rvv_combine_##name##_a ( \
+ vfloat32m1_t sa, vfloat32m1_t s, vfloat32m1_t da, vfloat32m1_t d, \
+ size_t vl) \
+ { \
+ return __riscv_vfsub_vv_f32m1 (__riscv_vfadd_vv_f32m1 (da, sa, vl), \
+ __riscv_vfmul_vv_f32m1 (da, sa, vl), \
+ vl); \
+ } \
+ \
+ static force_inline vfloat32m1_t rvv_combine_##name##_c ( \
+ vfloat32m1_t sa, vfloat32m1_t s, vfloat32m1_t da, vfloat32m1_t d, \
+ size_t vl) \
+ { \
+ vfloat32m1_t f = __riscv_vfmul_vf_f32m1 ( \
+ __riscv_vfadd_vv_f32m1 ( \
+ __riscv_vfmul_vv_f32m1 (__riscv_vfsub_vf_f32m1 (sa, 1.0f, vl), \
+ d, vl), \
+ __riscv_vfmul_vv_f32m1 (__riscv_vfsub_vf_f32m1 (da, 1.0f, vl), \
+ s, vl), \
+ vl), \
+ -1.0f, vl); \
+ \
+ return __riscv_vfadd_vv_f32m1 (f, rvv_blend_##name (sa, s, da, d, vl), \
+ vl); \
+ } \
+ \
+ RVV_MAKE_COMBINERS (name, rvv_combine_##name##_a, rvv_combine_##name##_c)
+
+RVV_MAKE_SEPARABLE_PDF_COMBINERS (multiply)
+RVV_MAKE_SEPARABLE_PDF_COMBINERS (screen)
+RVV_MAKE_SEPARABLE_PDF_COMBINERS (overlay)
+RVV_MAKE_SEPARABLE_PDF_COMBINERS (darken)
+RVV_MAKE_SEPARABLE_PDF_COMBINERS (lighten)
+RVV_MAKE_SEPARABLE_PDF_COMBINERS (color_dodge)
+RVV_MAKE_SEPARABLE_PDF_COMBINERS (color_burn)
+RVV_MAKE_SEPARABLE_PDF_COMBINERS (hard_light)
+RVV_MAKE_SEPARABLE_PDF_COMBINERS (soft_light)
+RVV_MAKE_SEPARABLE_PDF_COMBINERS (difference)
+RVV_MAKE_SEPARABLE_PDF_COMBINERS (exclusion)
+
+static const pixman_fast_path_t rvv_fast_paths[] = {
+ {PIXMAN_OP_NONE},
+};
+
+// clang-format off
+pixman_implementation_t *
+_pixman_implementation_create_rvv (pixman_implementation_t *fallback)
+{
+ pixman_implementation_t *imp = _pixman_implementation_create (fallback, rvv_fast_paths);
+
+ imp->combine_float[PIXMAN_OP_CLEAR] = rvv_combine_clear_u_float;
+ imp->combine_float[PIXMAN_OP_SRC] = rvv_combine_src_u_float;
+ imp->combine_float[PIXMAN_OP_DST] = rvv_combine_dst_u_float;
+ imp->combine_float[PIXMAN_OP_OVER] = rvv_combine_over_u_float;
+ imp->combine_float[PIXMAN_OP_OVER_REVERSE] = rvv_combine_over_reverse_u_float;
+ imp->combine_float[PIXMAN_OP_IN] = rvv_combine_in_u_float;
+ imp->combine_float[PIXMAN_OP_IN_REVERSE] = rvv_combine_in_reverse_u_float;
+ imp->combine_float[PIXMAN_OP_OUT] = rvv_combine_out_u_float;
+ imp->combine_float[PIXMAN_OP_OUT_REVERSE] = rvv_combine_out_reverse_u_float;
+ imp->combine_float[PIXMAN_OP_ATOP] = rvv_combine_atop_u_float;
+ imp->combine_float[PIXMAN_OP_ATOP_REVERSE] = rvv_combine_atop_reverse_u_float;
+ imp->combine_float[PIXMAN_OP_XOR] = rvv_combine_xor_u_float;
+ imp->combine_float[PIXMAN_OP_ADD] = rvv_combine_add_u_float;
+ imp->combine_float[PIXMAN_OP_SATURATE] = rvv_combine_saturate_u_float;
+
+ /* Disjoint, unified */
+ imp->combine_float[PIXMAN_OP_DISJOINT_CLEAR] = rvv_combine_disjoint_clear_u_float;
+ imp->combine_float[PIXMAN_OP_DISJOINT_SRC] = rvv_combine_disjoint_src_u_float;
+ imp->combine_float[PIXMAN_OP_DISJOINT_DST] = rvv_combine_disjoint_dst_u_float;
+ imp->combine_float[PIXMAN_OP_DISJOINT_OVER] = rvv_combine_disjoint_over_u_float;
+ imp->combine_float[PIXMAN_OP_DISJOINT_OVER_REVERSE] = rvv_combine_disjoint_over_reverse_u_float;
+ imp->combine_float[PIXMAN_OP_DISJOINT_IN] = rvv_combine_disjoint_in_u_float;
+ imp->combine_float[PIXMAN_OP_DISJOINT_IN_REVERSE] = rvv_combine_disjoint_in_reverse_u_float;
+ imp->combine_float[PIXMAN_OP_DISJOINT_OUT] = rvv_combine_disjoint_out_u_float;
+ imp->combine_float[PIXMAN_OP_DISJOINT_OUT_REVERSE] = rvv_combine_disjoint_out_reverse_u_float;
+ imp->combine_float[PIXMAN_OP_DISJOINT_ATOP] = rvv_combine_disjoint_atop_u_float;
+ imp->combine_float[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = rvv_combine_disjoint_atop_reverse_u_float;
+ imp->combine_float[PIXMAN_OP_DISJOINT_XOR] = rvv_combine_disjoint_xor_u_float;
+
+ /* Conjoint, unified */
+ imp->combine_float[PIXMAN_OP_CONJOINT_CLEAR] = rvv_combine_conjoint_clear_u_float;
+ imp->combine_float[PIXMAN_OP_CONJOINT_SRC] = rvv_combine_conjoint_src_u_float;
+ imp->combine_float[PIXMAN_OP_CONJOINT_DST] = rvv_combine_conjoint_dst_u_float;
+ imp->combine_float[PIXMAN_OP_CONJOINT_OVER] = rvv_combine_conjoint_over_u_float;
+ imp->combine_float[PIXMAN_OP_CONJOINT_OVER_REVERSE] = rvv_combine_conjoint_over_reverse_u_float;
+ imp->combine_float[PIXMAN_OP_CONJOINT_IN] = rvv_combine_conjoint_in_u_float;
+ imp->combine_float[PIXMAN_OP_CONJOINT_IN_REVERSE] = rvv_combine_conjoint_in_reverse_u_float;
+ imp->combine_float[PIXMAN_OP_CONJOINT_OUT] = rvv_combine_conjoint_out_u_float;
+ imp->combine_float[PIXMAN_OP_CONJOINT_OUT_REVERSE] = rvv_combine_conjoint_out_reverse_u_float;
+ imp->combine_float[PIXMAN_OP_CONJOINT_ATOP] = rvv_combine_conjoint_atop_u_float;
+ imp->combine_float[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = rvv_combine_conjoint_atop_reverse_u_float;
+ imp->combine_float[PIXMAN_OP_CONJOINT_XOR] = rvv_combine_conjoint_xor_u_float;
+
+ /* PDF operators, unified */
+ imp->combine_float[PIXMAN_OP_MULTIPLY] = rvv_combine_multiply_u_float;
+ imp->combine_float[PIXMAN_OP_SCREEN] = rvv_combine_screen_u_float;
+ imp->combine_float[PIXMAN_OP_OVERLAY] = rvv_combine_overlay_u_float;
+ imp->combine_float[PIXMAN_OP_DARKEN] = rvv_combine_darken_u_float;
+ imp->combine_float[PIXMAN_OP_LIGHTEN] = rvv_combine_lighten_u_float;
+ imp->combine_float[PIXMAN_OP_HARD_LIGHT] = rvv_combine_hard_light_u_float;
+ imp->combine_float[PIXMAN_OP_SOFT_LIGHT] = rvv_combine_soft_light_u_float;
+ imp->combine_float[PIXMAN_OP_DIFFERENCE] = rvv_combine_difference_u_float;
+ imp->combine_float[PIXMAN_OP_EXCLUSION] = rvv_combine_exclusion_u_float;
+ imp->combine_float[PIXMAN_OP_COLOR_DODGE] = rvv_combine_color_dodge_u_float;
+ imp->combine_float[PIXMAN_OP_COLOR_BURN] = rvv_combine_color_burn_u_float;
+
+ /* Component alpha combiners */
+ imp->combine_float_ca[PIXMAN_OP_CLEAR] = rvv_combine_clear_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_SRC] = rvv_combine_src_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_DST] = rvv_combine_dst_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_OVER] = rvv_combine_over_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_OVER_REVERSE] = rvv_combine_over_reverse_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_IN] = rvv_combine_in_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_IN_REVERSE] = rvv_combine_in_reverse_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_OUT] = rvv_combine_out_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_OUT_REVERSE] = rvv_combine_out_reverse_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_ATOP] = rvv_combine_atop_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_ATOP_REVERSE] = rvv_combine_atop_reverse_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_XOR] = rvv_combine_xor_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_ADD] = rvv_combine_add_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_SATURATE] = rvv_combine_saturate_ca_float;
+
+ /* Disjoint CA */
+ imp->combine_float_ca[PIXMAN_OP_DISJOINT_CLEAR] = rvv_combine_disjoint_clear_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_DISJOINT_SRC] = rvv_combine_disjoint_src_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_DISJOINT_DST] = rvv_combine_disjoint_dst_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_DISJOINT_OVER] = rvv_combine_disjoint_over_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_DISJOINT_OVER_REVERSE] = rvv_combine_disjoint_over_reverse_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_DISJOINT_IN] = rvv_combine_disjoint_in_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_DISJOINT_IN_REVERSE] = rvv_combine_disjoint_in_reverse_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_DISJOINT_OUT] = rvv_combine_disjoint_out_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_DISJOINT_OUT_REVERSE] = rvv_combine_disjoint_out_reverse_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_DISJOINT_ATOP] = rvv_combine_disjoint_atop_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = rvv_combine_disjoint_atop_reverse_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_DISJOINT_XOR] = rvv_combine_disjoint_xor_ca_float;
+
+ /* Conjoint CA */
+ imp->combine_float_ca[PIXMAN_OP_CONJOINT_CLEAR] = rvv_combine_conjoint_clear_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_CONJOINT_SRC] = rvv_combine_conjoint_src_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_CONJOINT_DST] = rvv_combine_conjoint_dst_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_CONJOINT_OVER] = rvv_combine_conjoint_over_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_CONJOINT_OVER_REVERSE] = rvv_combine_conjoint_over_reverse_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_CONJOINT_IN] = rvv_combine_conjoint_in_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_CONJOINT_IN_REVERSE] =rvv_combine_conjoint_in_reverse_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_CONJOINT_OUT] = rvv_combine_conjoint_out_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_CONJOINT_OUT_REVERSE] = rvv_combine_conjoint_out_reverse_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_CONJOINT_ATOP] = rvv_combine_conjoint_atop_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = rvv_combine_conjoint_atop_reverse_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_CONJOINT_XOR] = rvv_combine_conjoint_xor_ca_float;
+
+ /* PDF operators CA */
+ imp->combine_float_ca[PIXMAN_OP_MULTIPLY] = rvv_combine_multiply_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_SCREEN] = rvv_combine_screen_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_OVERLAY] = rvv_combine_overlay_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_DARKEN] = rvv_combine_darken_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_LIGHTEN] = rvv_combine_lighten_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_COLOR_DODGE] = rvv_combine_color_dodge_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_COLOR_BURN] = rvv_combine_color_burn_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_HARD_LIGHT] = rvv_combine_hard_light_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_SOFT_LIGHT] = rvv_combine_soft_light_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_DIFFERENCE] = rvv_combine_difference_ca_float;
+ imp->combine_float_ca[PIXMAN_OP_EXCLUSION] = rvv_combine_exclusion_ca_float;
+
+ /* It is not clear that these make sense, so make them noops for now */
+ imp->combine_float_ca[PIXMAN_OP_HSL_HUE] = rvv_combine_dst_u_float;
+ imp->combine_float_ca[PIXMAN_OP_HSL_SATURATION] = rvv_combine_dst_u_float;
+ imp->combine_float_ca[PIXMAN_OP_HSL_COLOR] = rvv_combine_dst_u_float;
+ imp->combine_float_ca[PIXMAN_OP_HSL_LUMINOSITY] = rvv_combine_dst_u_float;
+
+ return imp;
+}
+
+// clang-format on
\ No newline at end of file
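
All of the combiners in pixman-rvv.c share the same strip-mining structure: __riscv_vsetvl_e32m1 () returns how many 32-bit lanes the hardware will handle on this iteration, the vlseg4/vsseg4 intrinsics load and store whole four-channel pixels, and the loop advances by the returned vector length, so the code stays agnostic of the actual VLEN. A minimal sketch of that pattern in isolation, compiled with the same -march=rv64gcv flag the build system probes for (not part of the commit):

    #include <riscv_vector.h>
    #include <stddef.h>

    /* dst[i] += src[i] for n floats, processed in VLEN-sized strips. */
    static void
    add_f32 (float *dst, const float *src, size_t n)
    {
        while (n > 0)
        {
            size_t       vl = __riscv_vsetvl_e32m1 (n); /* lanes this pass */
            vfloat32m1_t s  = __riscv_vle32_v_f32m1 (src, vl);
            vfloat32m1_t d  = __riscv_vle32_v_f32m1 (dst, vl);

            __riscv_vse32_v_f32m1 (dst, __riscv_vfadd_vv_f32m1 (d, s, vl), vl);

            n   -= vl;
            src += vl;
            dst += vl;
        }
    }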