pixman: Branch 'master' - 2 commits

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Mon Apr 14 17:14:46 UTC 2025


 pixman/pixman-rvv.c | 2669 ++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 2477 insertions(+), 192 deletions(-)

New commits:
commit f09d4f0410519d122e6486cd8b30ed5a5274ba83
Author: Marek Pikuła <m.pikula at partner.samsung.com>
Date:   Fri Apr 11 04:32:44 2025 +0200

    rvv: Add integer operations
    
    This commit implements most of the fast paths outlined in
    `pixman-fast-path.c`, along with a complete implementation of the
    combine functions for integer operations.
    
    Co-Authored-By: Bernard Gingold <b.gingold at samsung.com>
    Signed-off-by: Marek Pikuła <m.pikula at partner.samsung.com>
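
For reference, the integer combiners in this patch vectorize pixman's usual
8-bit-per-channel arithmetic. A minimal scalar sketch of that arithmetic
(names follow pixman-combine32.h, assuming its usual ONE_HALF = 0x80 and
G_SHIFT = 8; this sketch is not part of the patch itself):

    #include <stdint.h>

    /* (x * a) / 255 with rounding, as in UN8_MUL_UN8 */
    static inline uint8_t un8_mul_un8 (uint8_t x, uint8_t a)
    {
        uint16_t t = (uint16_t)x * a + 0x80;   /* ONE_HALF */
        return (uint8_t)((t + (t >> 8)) >> 8); /* two shifts by G_SHIFT */
    }

    /* OVER: dest_c = src_c + dest_c * (255 - alpha(src)) / 255,
     * with a saturating add, for each of the four a8r8g8b8 channels. */
    static inline uint32_t over_un8x4 (uint32_t src, uint32_t dest)
    {
        uint8_t  ia = (uint8_t)~(src >> 24);
        uint32_t r  = 0;

        for (int c = 0; c < 32; c += 8)
        {
            uint32_t s = (src >> c) & 0xff;
            uint32_t d = un8_mul_un8 ((dest >> c) & 0xff, ia);
            uint32_t v = s + d;
            r |= (v > 0xff ? 0xff : v) << c;
        }
        return r;
    }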

diff --git a/pixman/pixman-rvv.c b/pixman/pixman-rvv.c
index 478fa62..570799c 100644
--- a/pixman/pixman-rvv.c
+++ b/pixman/pixman-rvv.c
@@ -2,6 +2,8 @@
  * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
  *             2005 Lars Knoll & Zack Rusin, Trolltech
  *             2024 Filip Wasil, Samsung Electronics
+ *             2024 Bernard Gingold, Samsung Electronics
+ *             2025 Marek Pikuła, Samsung Electronics
  * Permission to use, copy, modify, distribute, and sell this software and its
  * documentation for any purpose is hereby granted without fee, provided that
  * the above copyright notice appear in all copies and that both that
@@ -27,6 +29,8 @@
 #endif
 
 #include "pixman-combine-float.h"
+#include "pixman-combine32.h"
+#include "pixman-inlines.h"
 #include "pixman-private.h"
 
 #include <riscv_vector.h>
@@ -40,6 +44,50 @@
 #include <stdlib.h>
 #include <string.h>
 
+// Convenience macros {
+
+#define __FE_PTR(p, vl) ((p) += (vl))
+
+#define _RVV_FE_PRE(total_len, vn, vl, vspec)                                  \
+    size_t vn = total_len, vl = __riscv_vsetvl_##vspec (vn);                   \
+    vn > 0
+
+#define _RVV_FE_POST(vn, vl, vspec) vn -= (vl), vl = __riscv_vsetvl_##vspec (vn)
+
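+// RVV_FOREACH_n: strip-mined loop over total_len elements. Each iteration
+// processes vl elements (vl chosen by vsetvl for the given vspec) and then
+// advances the n pointer arguments by vl elements.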
+#define RVV_FOREACH_1(total_len, vl, vspec, p1)                                \
+    for (_RVV_FE_PRE (total_len, vn, vl, vspec);                               \
+	 __FE_PTR (p1, vl), _RVV_FE_POST (vn, vl, vspec))
+
+#define RVV_FOREACH_2(total_len, vl, vspec, p1, p2)                            \
+    for (_RVV_FE_PRE (total_len, vn, vl, vspec);                               \
+	 __FE_PTR (p1, vl), __FE_PTR (p2, vl), _RVV_FE_POST (vn, vl, vspec))
+
+#define RVV_FOREACH_3(total_len, vl, vspec, p1, p2, p3)                        \
+    for (_RVV_FE_PRE (total_len, vn, vl, vspec);                               \
+	 __FE_PTR (p1, vl), __FE_PTR (p2, vl), __FE_PTR (p3, vl),              \
+	 _RVV_FE_POST (vn, vl, vspec))
+
+// vuintXXmYY_t aliases for use in macros (reduces token concatenation).
+#define VUINT(ELEN, LMUL) vuint##ELEN##LMUL##_t
+#define VUINT32(LMUL)     VUINT (32, LMUL)
+#define VUINT16(LMUL)     VUINT (16, LMUL)
+#define VUINT8(LMUL)      VUINT (8, LMUL)
+
+// Shorthand for the vreinterpret casts commonly used in ARGB batch operations.
+#define RVV_U8x4_U32(LMUL, value)                                              \
+    __riscv_vreinterpret_v_u8##LMUL##_u32##LMUL (value)
+#define RVV_U8x4_U32_m2(value) RVV_U8x4_U32 (m2, value)
+#define RVV_U8x4_U32_m4(value) RVV_U8x4_U32 (m4, value)
+
+#define RVV_U32_U8x4(LMUL, value)                                              \
+    __riscv_vreinterpret_v_u32##LMUL##_u8##LMUL (value)
+#define RVV_U32_U8x4_m2(value) RVV_U32_U8x4 (m2, value)
+#define RVV_U32_U8x4_m4(value) RVV_U32_U8x4 (m4, value)
+
+// }
+
+// Float implementation
+
 /*
  * Screen
  *
@@ -852,135 +900,2371 @@ RVV_MAKE_SEPARABLE_PDF_COMBINERS (soft_light)
 RVV_MAKE_SEPARABLE_PDF_COMBINERS (difference)
 RVV_MAKE_SEPARABLE_PDF_COMBINERS (exclusion)
 
-static const pixman_fast_path_t rvv_fast_paths[] = {
-    {PIXMAN_OP_NONE},
-};
+// Integer implementation
 
-pixman_implementation_t *
-_pixman_implementation_create_rvv (pixman_implementation_t *fallback)
+// pixman-combine32.h RVV implementation plus some convenience functions {
+
+/*
+ * x_c = min(x_c + y_c, 255)
+ */
+
+#define rvv_UN8_ADD_UN8_vv(x, y, vl) __riscv_vsaddu (x, y, vl)
+
+#define rvv_UN8x4_ADD_UN8x4_vv_m4(x, y, vl)                                    \
+    RVV_U8x4_U32_m4 (rvv_UN8_ADD_UN8_vv (RVV_U32_U8x4_m4 (x),                  \
+					 RVV_U32_U8x4_m4 (y), (vl) * 4))
+
+/*
+ * x_c = (x_c * a_c) / 255
+ */
+
+#define __rvv_UN8_MUL_UN8_vv(LMUL, LMUL16)                                     \
+    static force_inline VUINT8 (LMUL) rvv_UN8_MUL_UN8_vv_##LMUL (              \
+	const VUINT8 (LMUL) x, const VUINT8 (LMUL) a, size_t vl)               \
+    {                                                                          \
+	VUINT16 (LMUL16)                                                       \
+	mul_higher = __riscv_vwmaccu (                                         \
+	    __riscv_vmv_v_x_u16##LMUL16 (ONE_HALF, vl), x, a, vl);             \
+                                                                               \
+	VUINT16 (LMUL16)                                                       \
+	mul_lower = __riscv_vsrl (mul_higher, G_SHIFT, vl);                    \
+                                                                               \
+	return __riscv_vnsrl (__riscv_vadd (mul_higher, mul_lower, vl),        \
+			      G_SHIFT, vl);                                    \
+    }
+__rvv_UN8_MUL_UN8_vv (m1, m2);
+__rvv_UN8_MUL_UN8_vv (m2, m4);
+__rvv_UN8_MUL_UN8_vv (m4, m8);
+
+static force_inline vuint8m4_t
+rvv_UN8_MUL_UN8_vx_m4 (const vuint8m4_t x, const uint8_t a, size_t vl)
 {
-    pixman_implementation_t *imp = _pixman_implementation_create (
-	fallback, rvv_fast_paths);
+    vuint16m8_t mul_higher = __riscv_vwmaccu (
+	__riscv_vmv_v_x_u16m8 (ONE_HALF, vl), a, x, vl);
+    vuint16m8_t mul_lower = __riscv_vsrl (mul_higher, G_SHIFT, vl);
 
-    // clang-format off
-    imp->combine_float[PIXMAN_OP_CLEAR] = rvv_combine_clear_u_float;
-    imp->combine_float[PIXMAN_OP_SRC] = rvv_combine_src_u_float;
-    imp->combine_float[PIXMAN_OP_DST] = rvv_combine_dst_u_float;
-    imp->combine_float[PIXMAN_OP_OVER] = rvv_combine_over_u_float;
-    imp->combine_float[PIXMAN_OP_OVER_REVERSE] = rvv_combine_over_reverse_u_float;
-    imp->combine_float[PIXMAN_OP_IN] = rvv_combine_in_u_float;
-    imp->combine_float[PIXMAN_OP_IN_REVERSE] = rvv_combine_in_reverse_u_float;
-    imp->combine_float[PIXMAN_OP_OUT] = rvv_combine_out_u_float;
-    imp->combine_float[PIXMAN_OP_OUT_REVERSE] = rvv_combine_out_reverse_u_float;
-    imp->combine_float[PIXMAN_OP_ATOP] = rvv_combine_atop_u_float;
-    imp->combine_float[PIXMAN_OP_ATOP_REVERSE] = rvv_combine_atop_reverse_u_float;
-    imp->combine_float[PIXMAN_OP_XOR] = rvv_combine_xor_u_float;
-    imp->combine_float[PIXMAN_OP_ADD] = rvv_combine_add_u_float;
-    imp->combine_float[PIXMAN_OP_SATURATE] = rvv_combine_saturate_u_float;
+    return __riscv_vnsrl (__riscv_vadd (mul_higher, mul_lower, vl), G_SHIFT,
+			  vl);
+}
 
-    /* Disjoint, unified */
-    imp->combine_float[PIXMAN_OP_DISJOINT_CLEAR] = rvv_combine_disjoint_clear_u_float;
-    imp->combine_float[PIXMAN_OP_DISJOINT_SRC] = rvv_combine_disjoint_src_u_float;
-    imp->combine_float[PIXMAN_OP_DISJOINT_DST] = rvv_combine_disjoint_dst_u_float;
-    imp->combine_float[PIXMAN_OP_DISJOINT_OVER] = rvv_combine_disjoint_over_u_float;
-    imp->combine_float[PIXMAN_OP_DISJOINT_OVER_REVERSE] = rvv_combine_disjoint_over_reverse_u_float;
-    imp->combine_float[PIXMAN_OP_DISJOINT_IN] = rvv_combine_disjoint_in_u_float;
-    imp->combine_float[PIXMAN_OP_DISJOINT_IN_REVERSE] = rvv_combine_disjoint_in_reverse_u_float;
-    imp->combine_float[PIXMAN_OP_DISJOINT_OUT] = rvv_combine_disjoint_out_u_float;
-    imp->combine_float[PIXMAN_OP_DISJOINT_OUT_REVERSE] = rvv_combine_disjoint_out_reverse_u_float;
-    imp->combine_float[PIXMAN_OP_DISJOINT_ATOP] = rvv_combine_disjoint_atop_u_float;
-    imp->combine_float[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = rvv_combine_disjoint_atop_reverse_u_float;
-    imp->combine_float[PIXMAN_OP_DISJOINT_XOR] = rvv_combine_disjoint_xor_u_float;
+#define __rvv_UN8x4_MUL_UN8x4_vv(LMUL, x, a, vl)                               \
+    RVV_U8x4_U32 (LMUL, rvv_UN8_MUL_UN8_vv_##LMUL (RVV_U32_U8x4 (LMUL, x),     \
+						   RVV_U32_U8x4 (LMUL, a),     \
+						   (vl) * 4))
+#define rvv_UN8x4_MUL_UN8x4_vv_m2(x, a, vl)                                    \
+    __rvv_UN8x4_MUL_UN8x4_vv (m2, x, a, vl)
+#define rvv_UN8x4_MUL_UN8x4_vv_m4(x, a, vl)                                    \
+    __rvv_UN8x4_MUL_UN8x4_vv (m4, x, a, vl)
 
-    /* Conjoint, unified */
-    imp->combine_float[PIXMAN_OP_CONJOINT_CLEAR] = rvv_combine_conjoint_clear_u_float;
-    imp->combine_float[PIXMAN_OP_CONJOINT_SRC] = rvv_combine_conjoint_src_u_float;
-    imp->combine_float[PIXMAN_OP_CONJOINT_DST] = rvv_combine_conjoint_dst_u_float;
-    imp->combine_float[PIXMAN_OP_CONJOINT_OVER] = rvv_combine_conjoint_over_u_float;
-    imp->combine_float[PIXMAN_OP_CONJOINT_OVER_REVERSE] = rvv_combine_conjoint_over_reverse_u_float;
-    imp->combine_float[PIXMAN_OP_CONJOINT_IN] = rvv_combine_conjoint_in_u_float;
-    imp->combine_float[PIXMAN_OP_CONJOINT_IN_REVERSE] = rvv_combine_conjoint_in_reverse_u_float;
-    imp->combine_float[PIXMAN_OP_CONJOINT_OUT] = rvv_combine_conjoint_out_u_float;
-    imp->combine_float[PIXMAN_OP_CONJOINT_OUT_REVERSE] = rvv_combine_conjoint_out_reverse_u_float;
-    imp->combine_float[PIXMAN_OP_CONJOINT_ATOP] = rvv_combine_conjoint_atop_u_float;
-    imp->combine_float[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = rvv_combine_conjoint_atop_reverse_u_float;
-    imp->combine_float[PIXMAN_OP_CONJOINT_XOR] = rvv_combine_conjoint_xor_u_float;
+/*
+ * a_c = a (broadcast to all components)
+ */
 
-    /* PDF operators, unified */
-    imp->combine_float[PIXMAN_OP_MULTIPLY] = rvv_combine_multiply_u_float;
-    imp->combine_float[PIXMAN_OP_SCREEN] = rvv_combine_screen_u_float;
-    imp->combine_float[PIXMAN_OP_OVERLAY] = rvv_combine_overlay_u_float;
-    imp->combine_float[PIXMAN_OP_DARKEN] = rvv_combine_darken_u_float;
-    imp->combine_float[PIXMAN_OP_LIGHTEN] = rvv_combine_lighten_u_float;
-    imp->combine_float[PIXMAN_OP_HARD_LIGHT] = rvv_combine_hard_light_u_float;
-    imp->combine_float[PIXMAN_OP_SOFT_LIGHT] = rvv_combine_soft_light_u_float;
-    imp->combine_float[PIXMAN_OP_DIFFERENCE] = rvv_combine_difference_u_float;
-    imp->combine_float[PIXMAN_OP_EXCLUSION] = rvv_combine_exclusion_u_float;
-    imp->combine_float[PIXMAN_OP_COLOR_DODGE] = rvv_combine_color_dodge_u_float;
-    imp->combine_float[PIXMAN_OP_COLOR_BURN] = rvv_combine_color_burn_u_float;
+#define __rvv_UN16_bcast_UN8x4_v(LMUL, LMUL16)                                 \
+    static force_inline VUINT32 (LMUL)                                         \
+	rvv_UN16_bcast_UN8x4_v_##LMUL (const VUINT16 (LMUL16) a, size_t vl)    \
+    {                                                                          \
+	VUINT32 (LMUL)                                                         \
+	a32 = __riscv_vwcvtu_x (__riscv_vmadd (a, 1 << 8, a, vl), vl);         \
+                                                                               \
+	return __riscv_vmadd (a32, 1 << 16, a32, vl);                          \
+    }
+__rvv_UN16_bcast_UN8x4_v (m2, m1);
+__rvv_UN16_bcast_UN8x4_v (m4, m2);
 
-    /* Component alpha combiners */
-    imp->combine_float_ca[PIXMAN_OP_CLEAR] = rvv_combine_clear_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_SRC] = rvv_combine_src_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_DST] = rvv_combine_dst_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_OVER] = rvv_combine_over_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_OVER_REVERSE] = rvv_combine_over_reverse_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_IN] = rvv_combine_in_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_IN_REVERSE] = rvv_combine_in_reverse_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_OUT] = rvv_combine_out_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_OUT_REVERSE] = rvv_combine_out_reverse_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_ATOP] = rvv_combine_atop_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_ATOP_REVERSE] = rvv_combine_atop_reverse_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_XOR] = rvv_combine_xor_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_ADD] = rvv_combine_add_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_SATURATE] = rvv_combine_saturate_ca_float;
+#define rvv_UN8_bcast_UN8x4_v_m4(a, vl)                                        \
+    rvv_UN16_bcast_UN8x4_v_m4 (__riscv_vwcvtu_x (a, vl), vl)
 
-    /* Disjoint CA */
-    imp->combine_float_ca[PIXMAN_OP_DISJOINT_CLEAR] = rvv_combine_disjoint_clear_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_DISJOINT_SRC] = rvv_combine_disjoint_src_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_DISJOINT_DST] = rvv_combine_disjoint_dst_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_DISJOINT_OVER] = rvv_combine_disjoint_over_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_DISJOINT_OVER_REVERSE] = rvv_combine_disjoint_over_reverse_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_DISJOINT_IN] = rvv_combine_disjoint_in_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_DISJOINT_IN_REVERSE] = rvv_combine_disjoint_in_reverse_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_DISJOINT_OUT] = rvv_combine_disjoint_out_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_DISJOINT_OUT_REVERSE] = rvv_combine_disjoint_out_reverse_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_DISJOINT_ATOP] = rvv_combine_disjoint_atop_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = rvv_combine_disjoint_atop_reverse_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_DISJOINT_XOR] = rvv_combine_disjoint_xor_ca_float;
+/*
+ * x_c = (x_c * a) / 255
+ */
 
-    /* Conjoint CA */
-    imp->combine_float_ca[PIXMAN_OP_CONJOINT_CLEAR] = rvv_combine_conjoint_clear_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_CONJOINT_SRC] = rvv_combine_conjoint_src_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_CONJOINT_DST] = rvv_combine_conjoint_dst_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_CONJOINT_OVER] = rvv_combine_conjoint_over_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_CONJOINT_OVER_REVERSE] = rvv_combine_conjoint_over_reverse_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_CONJOINT_IN] = rvv_combine_conjoint_in_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_CONJOINT_IN_REVERSE] = rvv_combine_conjoint_in_reverse_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_CONJOINT_OUT] = rvv_combine_conjoint_out_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_CONJOINT_OUT_REVERSE] = rvv_combine_conjoint_out_reverse_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_CONJOINT_ATOP] = rvv_combine_conjoint_atop_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = rvv_combine_conjoint_atop_reverse_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_CONJOINT_XOR] = rvv_combine_conjoint_xor_ca_float;
+#define rvv_UN8x4_MUL_UN8_vv_m4(x, a, vl)                                      \
+    rvv_UN8x4_MUL_UN8x4_vv_m4 (x, rvv_UN8_bcast_UN8x4_v_m4 (a, vl), vl)
 
-    /* PDF operators CA */
-    imp->combine_float_ca[PIXMAN_OP_MULTIPLY] = rvv_combine_multiply_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_SCREEN] = rvv_combine_screen_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_OVERLAY] = rvv_combine_overlay_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_DARKEN] = rvv_combine_darken_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_LIGHTEN] = rvv_combine_lighten_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_COLOR_DODGE] = rvv_combine_color_dodge_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_COLOR_BURN] = rvv_combine_color_burn_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_HARD_LIGHT] = rvv_combine_hard_light_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_SOFT_LIGHT] = rvv_combine_soft_light_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_DIFFERENCE] = rvv_combine_difference_ca_float;
-    imp->combine_float_ca[PIXMAN_OP_EXCLUSION] = rvv_combine_exclusion_ca_float;
+#define __rvv_UN8x4_MUL_UN16_vv(LMUL, x, a, vl)                                \
+    rvv_UN8x4_MUL_UN8x4_vv_##LMUL (x, rvv_UN16_bcast_UN8x4_v_##LMUL (a, vl), vl)
+#define rvv_UN8x4_MUL_UN16_vv_m2(x, a, vl)                                     \
+    __rvv_UN8x4_MUL_UN16_vv (m2, x, a, vl)
+#define rvv_UN8x4_MUL_UN16_vv_m4(x, a, vl)                                     \
+    __rvv_UN8x4_MUL_UN16_vv (m4, x, a, vl)
 
-    /* It is not clear that these make sense, so make them noops for now */
-    imp->combine_float_ca[PIXMAN_OP_HSL_HUE] = rvv_combine_dst_u_float;
-    imp->combine_float_ca[PIXMAN_OP_HSL_SATURATION] = rvv_combine_dst_u_float;
-    imp->combine_float_ca[PIXMAN_OP_HSL_COLOR] = rvv_combine_dst_u_float;
-    imp->combine_float_ca[PIXMAN_OP_HSL_LUMINOSITY] = rvv_combine_dst_u_float;
+#define rvv_UN8x4_MUL_UN8_vx_m4(x, a, vl)                                      \
+    RVV_U8x4_U32_m4 (rvv_UN8_MUL_UN8_vx_m4 (RVV_U32_U8x4_m4 (x), a, (vl) * 4))
+
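+// Rounded division by 255 (cf. DIV_ONE_UN8 in pixman-combine32.h):
+// (x + ONE_HALF + ((x + ONE_HALF) >> G_SHIFT)) >> G_SHIFT.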
+static force_inline vuint32m2_t
+rvv_DIV_ONE_UN32m2_UN32m2_v (const vuint32m2_t x, size_t vl)
+{
+    vuint32m2_t mul_higher = __riscv_vadd (x, ONE_HALF, vl);
+    vuint32m2_t mul_lower  = __riscv_vsrl (mul_higher, G_SHIFT, vl);
+
+    return __riscv_vsrl (__riscv_vadd (mul_higher, mul_lower, vl), G_SHIFT, vl);
+}
+
+static force_inline vuint8m2_t
+rvv_DIV_ONE_UN32m8_UN8m2_v (const vuint32m8_t x, size_t vl)
+{
+    vuint32m8_t mul_higher = __riscv_vadd (x, ONE_HALF, vl);
+    vuint32m8_t mul_lower  = __riscv_vsrl (mul_higher, G_SHIFT, vl);
+
+    return __riscv_vncvt_x (
+	__riscv_vnsrl (__riscv_vadd (mul_higher, mul_lower, vl), G_SHIFT, vl),
+	vl);
+}
+
+/*
+ * x_c = (x_c * a) / 255 + y_c
+ */
+
+#define rvv_UN8x4_MUL_UN16_ADD_UN8x4_vvv_m4(x, a, y, vl)                       \
+    rvv_UN8x4_ADD_UN8x4_vv_m4 (rvv_UN8x4_MUL_UN16_vv_m4 (x, a, vl), y, vl)
+
+/*
+ * x_c = (x_c * a + y_c * b) / 255
+ */
+
+#define rvv_UN8x4_MUL_UN16_ADD_UN8x4_MUL_UN16_vvvv_m4(x, a, y, b, vl)          \
+    rvv_UN8x4_ADD_UN8x4_vv_m4 (rvv_UN8x4_MUL_UN16_vv_m4 (x, a, vl),            \
+			       rvv_UN8x4_MUL_UN16_vv_m4 (y, b, vl), vl)
+
+/*
+ * x_c = (x_c * a_c) / 255 + y_c
+ */
+
+#define rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_vvv_m4(x, a, y, vl)                      \
+    rvv_UN8x4_ADD_UN8x4_vv_m4 (rvv_UN8x4_MUL_UN8x4_vv_m4 (x, a, vl), y, vl)
+
+/*
+ * x_c = (x_c * a_c + y_c * b) / 255
+ */
+
+#define rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN16_vvvv_m4(x, a, y, b, vl)         \
+    rvv_UN8x4_ADD_UN8x4_vv_m4 (rvv_UN8x4_MUL_UN8x4_vv_m4 (x, a, vl),           \
+			       rvv_UN8x4_MUL_UN16_vv_m4 (y, b, vl), vl)
+
+// } pixman-combine32.h
+
+// Additional functions.
+
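+// Extract the alpha byte (bits 31:24 of each a8r8g8b8 pixel) into 16-bit lanes.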
+#define rvv_shift_alpha_u16(x, vl) __riscv_vnsrl (x, 24, vl)
+
+#define rvv_shift_not_alpha_u16(x, vl)                                         \
+    rvv_shift_alpha_u16 (__riscv_vnot (x, vl), vl)
+
+#define rvv_load_alpha_u8m1(src, vl)                                           \
+    __riscv_vlse8_v_u8m1 ((uint8_t *)src + 3, 4, vl)
+
+#define rvv_load_not_alpha_u8m1(src, vl)                                       \
+    __riscv_vnot (rvv_load_alpha_u8m1 (src, vl), vl)
+
+#define rvv_u8m2_to_i16m4(in, vl)                                              \
+    __riscv_vreinterpret_i16m4 (__riscv_vwcvtu_x (in, vl))
+
+#define rvv_over_m4(src, dest, vl)                                             \
+    rvv_UN8x4_MUL_UN16_ADD_UN8x4_vvv_m4 (                                      \
+	dest, rvv_shift_not_alpha_u16 (src, vl), src, vl)
+
+#define rvv_in_m4(x, y, vl) rvv_UN8x4_MUL_UN8_vv_m4 (x, y, vl)
+
+#define rvv_in_load_s_m_m4(src, mask, vl)                                      \
+    rvv_in_m4 (__riscv_vle32_v_u32m4 (src, vl),                                \
+	       rvv_load_alpha_u8m1 (mask, vl), vl)
+
+#define rvv_in_load_s_nm_m4(src, mask, vl)                                     \
+    rvv_in_m4 (__riscv_vle32_v_u32m4 (src, vl),                                \
+	       rvv_load_not_alpha_u8m1 (mask, vl), vl)
+
+static force_inline vuint16m2_t
+rvv_convert_8888_to_0565_m2 (const vuint32m4_t s, size_t vl)
+{
+    vuint32m4_t rb = __riscv_vand (s, 0xF800F8, vl);
+
+    return __riscv_vor (
+	__riscv_vor (__riscv_vnsrl (rb, 3, vl), __riscv_vnsrl (rb, 8, vl), vl),
+	__riscv_vand (__riscv_vnsrl (s, 5, vl), 0x7E0, vl), vl);
+}
+
+static force_inline vuint32m4_t
+rvv_convert_0565_to_0888_m4 (const vuint16m2_t s, size_t vl)
+{
+    vuint8m1_t  g1, g2;
+    vuint16m2_t r, g_w, b;
+    vuint32m4_t r_w, rb_w;
+
+    r    = __riscv_vand (s, 0xF800, vl);
+    b    = __riscv_vand (s, 0x001F, vl);
+    r_w  = __riscv_vwmulu (r, 1 << 8, vl);
+    rb_w = __riscv_vwmaccu (r_w, 1 << 3, b, vl);
+    rb_w = __riscv_vand (__riscv_vor (rb_w, __riscv_vsrl (rb_w, 5, vl), vl),
+			 0xFF00FF, vl);
+
+    g1  = __riscv_vsll (__riscv_vnsrl (s, 5, vl), 2, vl);
+    g2  = __riscv_vsrl (g1, 6, vl);
+    g_w = __riscv_vwaddu_vv (g1, g2, vl);
+
+    return __riscv_vwmaccu (rb_w, 1 << 8, g_w, vl);
+}
+
+#define rvv_convert_0565_to_8888_m4(s, vl)                                     \
+    __riscv_vor (rvv_convert_0565_to_0888_m4 (s, vl), 0xff000000, vl)
+
+#define __rvv_combine_mask_value_ca(LMUL, src, mask, vl)                       \
+    rvv_UN8x4_MUL_UN8x4_vv_##LMUL (src, mask, vl)
+#define rvv_combine_mask_value_ca_m2(src, mask, vl)                            \
+    __rvv_combine_mask_value_ca (m2, src, mask, vl)
+#define rvv_combine_mask_value_ca_m4(src, mask, vl)                            \
+    __rvv_combine_mask_value_ca (m4, src, mask, vl)
+
+#define __rvv_combine_mask_alpha_ca(LMUL, src, mask, vl)                       \
+    rvv_UN8x4_MUL_UN16_vv_##LMUL (mask, rvv_shift_alpha_u16 (src, vl), vl)
+#define rvv_combine_mask_alpha_ca_m2(src, mask, vl)                            \
+    __rvv_combine_mask_alpha_ca (m2, src, mask, vl)
+#define rvv_combine_mask_alpha_ca_m4(src, mask, vl)                            \
+    __rvv_combine_mask_alpha_ca (m4, src, mask, vl)
+
+#define __rvv_combine_mask(LMUL, src, mask, vl)                                \
+    rvv_UN8x4_MUL_UN16_vv_##LMUL (src, rvv_shift_alpha_u16 (mask, vl), vl)
+#define rvv_combine_mask_m2(src, mask, vl)                                     \
+    __rvv_combine_mask (m2, src, mask, vl)
+#define rvv_combine_mask_m4(src, mask, vl)                                     \
+    __rvv_combine_mask (m4, src, mask, vl)
+
+#define __rvv_combine_mask_ca(LMUL)                                            \
+    static force_inline void rvv_combine_mask_ca_##LMUL (                      \
+	VUINT32 (LMUL) *__restrict__ src, VUINT32 (LMUL) *__restrict__ mask,   \
+	size_t vl)                                                             \
+    {                                                                          \
+	VUINT32 (LMUL) src_cpy = *src;                                         \
+	*(src)  = rvv_combine_mask_value_ca_##LMUL (*(src), *(mask), vl);      \
+	*(mask) = rvv_combine_mask_alpha_ca_##LMUL (src_cpy, *(mask), vl);     \
+    }
+__rvv_combine_mask_ca (m2);
+__rvv_combine_mask_ca (m4);
+
+static void
+rvv_combine_clear (pixman_implementation_t *__restrict__ imp,
+		   pixman_op_t op,
+		   uint32_t *__restrict__ dest,
+		   const uint32_t *__restrict__ src,
+		   const uint32_t *__restrict__ mask,
+		   int width)
+{
+    uint32_t *pd = dest;
+
+    vuint32m8_t v = __riscv_vmv_v_x_u32m8 (0, __riscv_vsetvlmax_e32m8 ());
+    RVV_FOREACH_1 (width, vl, e32m8, pd) { __riscv_vse32 (pd, v, vl); }
+}
+
+static void
+rvv_combine_src_u (pixman_implementation_t *__restrict__ imp,
+		   pixman_op_t op,
+		   uint32_t *__restrict__ dest,
+		   const uint32_t *__restrict__ src,
+		   const uint32_t *__restrict__ mask,
+		   int width)
+{
+    uint32_t *__restrict__ pd       = dest;
+    const uint32_t *__restrict__ ps = src;
+    const uint32_t *__restrict__ pm = mask;
+
+    if (mask)
+    {
+	RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
+	{
+	    __riscv_vse32 (pd, rvv_in_load_s_m_m4 (ps, pm, vl), vl);
+	}
+    }
+    else
+    {
+	RVV_FOREACH_2 (width, vl, e32m8, ps, pd)
+	{
+	    __riscv_vse32 (pd, __riscv_vle32_v_u32m8 (ps, vl), vl);
+	}
+    }
+}
+
+static void
+rvv_combine_over_u (pixman_implementation_t *__restrict__ imp,
+		    pixman_op_t op,
+		    uint32_t *__restrict__ dest,
+		    const uint32_t *__restrict__ src,
+		    const uint32_t *__restrict__ mask,
+		    int width)
+{
+    uint32_t *__restrict__ pd       = dest;
+    const uint32_t *__restrict__ ps = src;
+    const uint32_t *__restrict__ pm = mask;
+
+    if (mask)
+    {
+	RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
+	{
+	    __riscv_vse32 (pd,
+			   rvv_over_m4 (rvv_in_load_s_m_m4 (ps, pm, vl),
+					__riscv_vle32_v_u32m4 (pd, vl), vl),
+			   vl);
+	}
+    }
+    else
+    {
+	RVV_FOREACH_2 (width, vl, e32m4, ps, pd)
+	{
+	    __riscv_vse32 (pd,
+			   rvv_over_m4 (__riscv_vle32_v_u32m4 (ps, vl),
+					__riscv_vle32_v_u32m4 (pd, vl), vl),
+			   vl);
+	}
+    }
+}
+
+static void
+rvv_combine_over_reverse_u (pixman_implementation_t *__restrict__ imp,
+			    pixman_op_t op,
+			    uint32_t *__restrict__ dest,
+			    const uint32_t *__restrict__ src,
+			    const uint32_t *__restrict__ mask,
+			    int width)
+{
+    uint32_t *__restrict__ pd       = dest;
+    const uint32_t *__restrict__ ps = src;
+    const uint32_t *__restrict__ pm = mask;
+
+    if (mask)
+    {
+	RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
+	{
+	    __riscv_vse32 (pd,
+			   rvv_over_m4 (__riscv_vle32_v_u32m4 (pd, vl),
+					rvv_in_load_s_m_m4 (ps, pm, vl), vl),
+			   vl);
+	}
+    }
+    else
+    {
+	RVV_FOREACH_2 (width, vl, e32m4, ps, pd)
+	{
+	    __riscv_vse32 (pd,
+			   rvv_over_m4 (__riscv_vle32_v_u32m4 (pd, vl),
+					__riscv_vle32_v_u32m4 (ps, vl), vl),
+			   vl);
+	}
+    }
+}
+
+static void
+rvv_combine_in_u (pixman_implementation_t *__restrict__ imp,
+		  pixman_op_t op,
+		  uint32_t *__restrict__ dest,
+		  const uint32_t *__restrict__ src,
+		  const uint32_t *__restrict__ mask,
+		  int width)
+{
+    uint32_t *__restrict__ pd       = dest;
+    const uint32_t *__restrict__ ps = src;
+    const uint32_t *__restrict__ pm = mask;
+
+    if (mask)
+    {
+	RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
+	{
+	    __riscv_vse32 (pd,
+			   rvv_in_m4 (rvv_in_load_s_m_m4 (ps, pm, vl),
+				      rvv_load_alpha_u8m1 (pd, vl), vl),
+			   vl);
+	}
+    }
+    else
+    {
+	RVV_FOREACH_2 (width, vl, e32m4, ps, pd)
+	{
+	    __riscv_vse32 (pd, rvv_in_load_s_m_m4 (ps, pd, vl), vl);
+	}
+    }
+}
+
+static void
+rvv_combine_in_reverse_u (pixman_implementation_t *__restrict__ imp,
+			  pixman_op_t op,
+			  uint32_t *__restrict__ dest,
+			  const uint32_t *__restrict__ src,
+			  const uint32_t *__restrict__ mask,
+			  int width)
+{
+    uint32_t *__restrict__ pd       = dest;
+    const uint32_t *__restrict__ ps = src;
+    const uint32_t *__restrict__ pm = mask;
+
+    if (mask)
+    {
+	RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
+	{
+	    __riscv_vse32 (pd,
+			   rvv_in_m4 (__riscv_vle32_v_u32m4 (pd, vl),
+				      rvv_UN8_MUL_UN8_vv_m1 (
+					  rvv_load_alpha_u8m1 (ps, vl),
+					  rvv_load_alpha_u8m1 (pm, vl), vl),
+				      vl),
+			   vl);
+	}
+    }
+    else
+    {
+	RVV_FOREACH_2 (width, vl, e32m4, ps, pd)
+	{
+	    __riscv_vse32 (pd, rvv_in_load_s_m_m4 (pd, ps, vl), vl);
+	}
+    }
+}
+
+static void
+rvv_combine_out_u (pixman_implementation_t *__restrict__ imp,
+		   pixman_op_t op,
+		   uint32_t *__restrict__ dest,
+		   const uint32_t *__restrict__ src,
+		   const uint32_t *__restrict__ mask,
+		   int width)
+{
+    uint32_t *__restrict__ pd       = dest;
+    const uint32_t *__restrict__ ps = src;
+    const uint32_t *__restrict__ pm = mask;
+
+    if (mask)
+    {
+	RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
+	{
+	    __riscv_vse32 (pd,
+			   rvv_in_m4 (rvv_in_load_s_m_m4 (ps, pm, vl),
+				      rvv_load_not_alpha_u8m1 (pd, vl), vl),
+			   vl);
+	}
+    }
+    else
+    {
+	RVV_FOREACH_2 (width, vl, e32m4, ps, pd)
+	{
+	    __riscv_vse32 (pd, rvv_in_load_s_nm_m4 (ps, pd, vl), vl);
+	}
+    }
+}
+
+static void
+rvv_combine_out_reverse_u (pixman_implementation_t *__restrict__ imp,
+			   pixman_op_t op,
+			   uint32_t *__restrict__ dest,
+			   const uint32_t *__restrict__ src,
+			   const uint32_t *__restrict__ mask,
+			   int width)
+{
+    uint32_t *__restrict__ pd       = dest;
+    const uint32_t *__restrict__ ps = src;
+    const uint32_t *__restrict__ pm = mask;
+
+    if (mask)
+    {
+	RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
+	{
+	    __riscv_vse32 (
+		pd,
+		rvv_in_m4 (__riscv_vle32_v_u32m4 (pd, vl),
+			   __riscv_vnot (rvv_UN8_MUL_UN8_vv_m1 (
+					     rvv_load_alpha_u8m1 (ps, vl),
+					     rvv_load_alpha_u8m1 (pm, vl), vl),
+					 vl),
+			   vl),
+		vl);
+	}
+    }
+    else
+    {
+	RVV_FOREACH_2 (width, vl, e32m4, ps, pd)
+	{
+	    __riscv_vse32 (pd, rvv_in_load_s_nm_m4 (pd, ps, vl), vl);
+	}
+    }
+}
+
+static void
+rvv_combine_atop_u (pixman_implementation_t *__restrict__ imp,
+		    pixman_op_t op,
+		    uint32_t *__restrict__ dest,
+		    const uint32_t *__restrict__ src,
+		    const uint32_t *__restrict__ mask,
+		    int width)
+{
+    uint32_t *__restrict__ pd       = dest;
+    const uint32_t *__restrict__ ps = src;
+    const uint32_t *__restrict__ pm = mask;
+    vuint32m4_t s, d;
+
+    if (mask)
+    {
+	RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
+	{
+	    s = rvv_in_load_s_m_m4 (ps, pm, vl);
+	    d = __riscv_vle32_v_u32m4 (pd, vl);
+	    __riscv_vse32 (pd,
+			   rvv_UN8x4_MUL_UN16_ADD_UN8x4_MUL_UN16_vvvv_m4 (
+			       s, rvv_shift_alpha_u16 (d, vl), d,
+			       rvv_shift_not_alpha_u16 (s, vl), vl),
+			   vl);
+	}
+    }
+    else
+    {
+	RVV_FOREACH_2 (width, vl, e32m4, ps, pd)
+	{
+	    s = __riscv_vle32_v_u32m4 (ps, vl);
+	    d = __riscv_vle32_v_u32m4 (pd, vl);
+	    __riscv_vse32 (pd,
+			   rvv_UN8x4_MUL_UN16_ADD_UN8x4_MUL_UN16_vvvv_m4 (
+			       s, rvv_shift_alpha_u16 (d, vl), d,
+			       rvv_shift_not_alpha_u16 (s, vl), vl),
+			   vl);
+	}
+    }
+}
+
+static void
+rvv_combine_atop_reverse_u (pixman_implementation_t *__restrict__ imp,
+			    pixman_op_t op,
+			    uint32_t *__restrict__ dest,
+			    const uint32_t *__restrict__ src,
+			    const uint32_t *__restrict__ mask,
+			    int width)
+{
+    uint32_t *__restrict__ pd       = dest;
+    const uint32_t *__restrict__ ps = src;
+    const uint32_t *__restrict__ pm = mask;
+    vuint32m4_t s, d;
+
+    if (mask)
+    {
+	RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
+	{
+	    s = rvv_in_load_s_m_m4 (ps, pm, vl);
+	    d = __riscv_vle32_v_u32m4 (pd, vl);
+	    __riscv_vse32 (pd,
+			   rvv_UN8x4_MUL_UN16_ADD_UN8x4_MUL_UN16_vvvv_m4 (
+			       s, rvv_shift_not_alpha_u16 (d, vl), d,
+			       rvv_shift_alpha_u16 (s, vl), vl),
+			   vl);
+	}
+    }
+    else
+    {
+	RVV_FOREACH_2 (width, vl, e32m4, ps, pd)
+	{
+	    s = __riscv_vle32_v_u32m4 (ps, vl);
+	    d = __riscv_vle32_v_u32m4 (pd, vl);
+	    __riscv_vse32 (pd,
+			   rvv_UN8x4_MUL_UN16_ADD_UN8x4_MUL_UN16_vvvv_m4 (
+			       s, rvv_shift_not_alpha_u16 (d, vl), d,
+			       rvv_shift_alpha_u16 (s, vl), vl),
+			   vl);
+	}
+    }
+}
+
+static void
+rvv_combine_xor_u (pixman_implementation_t *__restrict__ imp,
+		   pixman_op_t op,
+		   uint32_t *__restrict__ dest,
+		   const uint32_t *__restrict__ src,
+		   const uint32_t *__restrict__ mask,
+		   int width)
+{
+    uint32_t *__restrict__ pd       = dest;
+    const uint32_t *__restrict__ ps = src;
+    const uint32_t *__restrict__ pm = mask;
+    vuint32m4_t s, d;
+
+    if (mask)
+    {
+	RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
+	{
+	    s = rvv_in_load_s_m_m4 (ps, pm, vl);
+	    d = __riscv_vle32_v_u32m4 (pd, vl);
+	    __riscv_vse32 (pd,
+			   rvv_UN8x4_MUL_UN16_ADD_UN8x4_MUL_UN16_vvvv_m4 (
+			       s, rvv_shift_not_alpha_u16 (d, vl), d,
+			       rvv_shift_not_alpha_u16 (s, vl), vl),
+			   vl);
+	}
+    }
+    else
+    {
+	RVV_FOREACH_2 (width, vl, e32m4, ps, pd)
+	{
+	    s = __riscv_vle32_v_u32m4 (ps, vl);
+	    d = __riscv_vle32_v_u32m4 (pd, vl);
+	    __riscv_vse32 (pd,
+			   rvv_UN8x4_MUL_UN16_ADD_UN8x4_MUL_UN16_vvvv_m4 (
+			       s, rvv_shift_not_alpha_u16 (d, vl), d,
+			       rvv_shift_not_alpha_u16 (s, vl), vl),
+			   vl);
+	}
+    }
+}
+
+static void
+rvv_combine_add_u (pixman_implementation_t *__restrict__ imp,
+		   pixman_op_t op,
+		   uint32_t *__restrict__ dest,
+		   const uint32_t *__restrict__ src,
+		   const uint32_t *__restrict__ mask,
+		   int width)
+{
+    uint32_t *__restrict__ pd       = dest;
+    const uint32_t *__restrict__ ps = src;
+    const uint32_t *__restrict__ pm = mask;
+
+    if (mask)
+    {
+	RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
+	{
+	    __riscv_vse32 (
+		pd,
+		rvv_UN8x4_ADD_UN8x4_vv_m4 (__riscv_vle32_v_u32m4 (pd, vl),
+					   rvv_in_load_s_m_m4 (ps, pm, vl), vl),
+		vl);
+	}
+    }
+    else
+    {
+	RVV_FOREACH_2 (width, vl, e32m4, ps, pd)
+	{
+	    __riscv_vse32 (
+		pd,
+		rvv_UN8x4_ADD_UN8x4_vv_m4 (__riscv_vle32_v_u32m4 (pd, vl),
+					   __riscv_vle32_v_u32m4 (ps, vl), vl),
+		vl);
+	}
+    }
+}
+
+/*
+ * Multiply
+ *
+ *      ad * as * B(d / ad, s / as)
+ *    = ad * as * d/ad * s/as
+ *    = d * s
+ *
+ */
+static void
+rvv_combine_multiply_u (pixman_implementation_t *imp,
+			pixman_op_t              op,
+			uint32_t *__restrict__ dest,
+			const uint32_t *__restrict__ src,
+			const uint32_t *__restrict__ mask,
+			int width)
+{
+    uint32_t *__restrict__ pd       = dest;
+    const uint32_t *__restrict__ ps = src;
+    const uint32_t *__restrict__ pm = mask;
+
+    vuint32m4_t s, d;
+    if (mask)
+    {
+	RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
+	{
+	    s = rvv_in_load_s_m_m4 (ps, pm, vl);
+	    d = __riscv_vle32_v_u32m4 (pd, vl);
+
+	    __riscv_vse32 (pd,
+			   rvv_UN8x4_ADD_UN8x4_vv_m4 (
+			       rvv_UN8x4_MUL_UN8x4_vv_m4 (d, s, vl),
+			       rvv_UN8x4_MUL_UN16_ADD_UN8x4_MUL_UN16_vvvv_m4 (
+				   s, rvv_shift_not_alpha_u16 (d, vl), d,
+				   rvv_shift_not_alpha_u16 (s, vl), vl),
+			       vl),
+			   vl);
+	}
+    }
+    else
+    {
+	RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
+	{
+	    s = __riscv_vle32_v_u32m4 (ps, vl);
+	    d = __riscv_vle32_v_u32m4 (pd, vl);
+
+	    __riscv_vse32 (pd,
+			   rvv_UN8x4_ADD_UN8x4_vv_m4 (
+			       rvv_UN8x4_MUL_UN8x4_vv_m4 (d, s, vl),
+			       rvv_UN8x4_MUL_UN16_ADD_UN8x4_MUL_UN16_vvvv_m4 (
+				   s, rvv_shift_not_alpha_u16 (d, vl), d,
+				   rvv_shift_not_alpha_u16 (s, vl), vl),
+			       vl),
+			   vl);
+	}
+    }
+}
+
+static void
+rvv_combine_multiply_ca (pixman_implementation_t *__restrict__ imp,
+			 pixman_op_t op,
+			 uint32_t *__restrict__ dest,
+			 const uint32_t *__restrict__ src,
+			 const uint32_t *__restrict__ mask,
+			 int width)
+{
+    uint32_t *__restrict__ pd       = dest;
+    const uint32_t *__restrict__ ps = src;
+    const uint32_t *__restrict__ pm = mask;
+
+    vuint32m4_t s, m, d;
+
+    RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
+    {
+	s = __riscv_vle32_v_u32m4 (ps, vl);
+	m = __riscv_vle32_v_u32m4 (pm, vl);
+	rvv_combine_mask_ca_m4 (&s, &m, vl);
+
+	d = __riscv_vle32_v_u32m4 (pd, vl);
+
+	__riscv_vse32 (pd,
+		       rvv_UN8x4_ADD_UN8x4_vv_m4 (
+			   rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN16_vvvv_m4 (
+			       d, __riscv_vnot (m, vl), s,
+			       rvv_shift_not_alpha_u16 (d, vl), vl),
+			   rvv_UN8x4_MUL_UN8x4_vv_m4 (d, s, vl), vl),
+		       vl);
+    }
+}
+
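+// Generates the _u and _ca combiners for a separable PDF blend mode from its
+// rvv_blend_<name>_int kernel; per-component results are on a 255*255 scale
+// and are clamped and reduced with the rvv_DIV_ONE_* helpers before storing.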
+#define PDF_SEPARABLE_BLEND_MODE(name)                                         \
+    static void rvv_combine_##name##_u (                                       \
+	pixman_implementation_t *imp, pixman_op_t op, uint32_t *dest,          \
+	const uint32_t *src, const uint32_t *mask, int width)                  \
+    {                                                                          \
+	uint32_t *__restrict__ pd       = dest;                                \
+	const uint32_t *__restrict__ ps = src;                                 \
+	const uint32_t *__restrict__ pm = mask;                                \
+                                                                               \
+	vuint32m2_t s, d, ra, rx;                                              \
+	vuint16m1_t da, sa;                                                    \
+	size_t      vl4;                                                       \
+	vuint8m2_t  s4, d4, sa4, isa4, da4, ida4;                              \
+	vuint32m8_t rx4;                                                       \
+                                                                               \
+	RVV_FOREACH_3 (width, vl, e32m2, ps, pm, pd)                           \
+	{                                                                      \
+	    vl4 = vl * 4;                                                      \
+                                                                               \
+	    s = __riscv_vle32_v_u32m2 (ps, vl);                                \
+	    if (mask)                                                          \
+		s = rvv_combine_mask_m2 (s, __riscv_vle32_v_u32m2 (pm, vl),    \
+					 vl);                                  \
+	    sa = rvv_shift_alpha_u16 (s, vl);                                  \
+                                                                               \
+	    d  = __riscv_vle32_v_u32m2 (pd, vl);                               \
+	    da = rvv_shift_alpha_u16 (d, vl);                                  \
+                                                                               \
+	    ra = __riscv_vsub (__riscv_vwaddu_vv (__riscv_vmul (da, 0xFF, vl), \
+						  __riscv_vmul (sa, 0xFF, vl), \
+						  vl),                         \
+			       __riscv_vwmulu (sa, da, vl), vl);               \
+                                                                               \
+	    s4   = RVV_U32_U8x4_m2 (s);                                        \
+	    sa4  = RVV_U32_U8x4_m2 (rvv_UN16_bcast_UN8x4_v_m2 (sa, vl));       \
+	    isa4 = __riscv_vnot (sa4, vl4);                                    \
+	    d4   = RVV_U32_U8x4_m2 (d);                                        \
+	    da4  = RVV_U32_U8x4_m2 (rvv_UN16_bcast_UN8x4_v_m2 (da, vl));       \
+	    ida4 = __riscv_vnot (da4, vl4);                                    \
+                                                                               \
+	    rx4 = __riscv_vadd (                                               \
+		__riscv_vwaddu_vv (__riscv_vwmulu (isa4, d4, vl4),             \
+				   __riscv_vwmulu (ida4, s4, vl4), vl4),       \
+		rvv_blend_##name##_int (d4, da4, s4, sa4, vl4), vl4);          \
+                                                                               \
+	    ra  = __riscv_vminu (ra, 255 * 255, vl);                           \
+	    rx4 = __riscv_vminu (rx4, 255 * 255, vl4);                         \
+                                                                               \
+	    ra = rvv_DIV_ONE_UN32m2_UN32m2_v (ra, vl);                         \
+	    rx = RVV_U8x4_U32_m2 (rvv_DIV_ONE_UN32m8_UN8m2_v (rx4, vl4));      \
+                                                                               \
+	    __riscv_vse32 (pd,                                                 \
+			   __riscv_vor (__riscv_vsll (ra, 24, vl),             \
+					__riscv_vand (rx, 0x00FFFFFF, vl),     \
+					vl),                                   \
+			   vl);                                                \
+	}                                                                      \
+    }                                                                          \
+                                                                               \
+    static void rvv_combine_##name##_ca (                                      \
+	pixman_implementation_t *imp, pixman_op_t op, uint32_t *dest,          \
+	const uint32_t *src, const uint32_t *mask, int width)                  \
+    {                                                                          \
+	uint32_t *__restrict__ pd       = dest;                                \
+	const uint32_t *__restrict__ ps = src;                                 \
+	const uint32_t *__restrict__ pm = mask;                                \
+                                                                               \
+	vuint32m2_t s, m, d, ra, rx;                                           \
+	vuint16m1_t da, sa;                                                    \
+	size_t      vl4;                                                       \
+	vuint8m2_t  s4, m4, d4, ixa4, da4, ida4;                               \
+	vuint32m8_t rx4;                                                       \
+                                                                               \
+	RVV_FOREACH_3 (width, vl, e32m2, ps, pm, pd)                           \
+	{                                                                      \
+	    m = __riscv_vle32_v_u32m2 (pm, vl);                                \
+	    s = __riscv_vle32_v_u32m2 (ps, vl);                                \
+	    rvv_combine_mask_ca_m2 (&s, &m, vl);                               \
+	    sa = rvv_shift_alpha_u16 (s, vl);                                  \
+                                                                               \
+	    d  = __riscv_vle32_v_u32m2 (pd, vl);                               \
+	    da = rvv_shift_alpha_u16 (d, vl);                                  \
+                                                                               \
+	    ra = __riscv_vsub (__riscv_vwaddu_vv (__riscv_vmul (da, 0xFF, vl), \
+						  __riscv_vmul (sa, 0xFF, vl), \
+						  vl),                         \
+			       __riscv_vwmulu (sa, da, vl), vl);               \
+                                                                               \
+	    ixa4 = RVV_U32_U8x4_m2 (__riscv_vnot (m, vl));                     \
+	    d4   = RVV_U32_U8x4_m2 (d);                                        \
+	    ida4 = RVV_U32_U8x4_m2 (                                           \
+		__riscv_vnot (rvv_UN16_bcast_UN8x4_v_m2 (da, vl), vl));        \
+	    s4  = RVV_U32_U8x4_m2 (s);                                         \
+	    da4 = RVV_U32_U8x4_m2 (rvv_UN16_bcast_UN8x4_v_m2 (da, vl));        \
+	    m4  = RVV_U32_U8x4_m2 (m);                                         \
+                                                                               \
+	    vl4 = vl * 4;                                                      \
+	    rx4 = __riscv_vadd (                                               \
+		__riscv_vwaddu_vv (__riscv_vwmulu (ixa4, d4, vl4),             \
+				   __riscv_vwmulu (ida4, s4, vl4), vl4),       \
+		rvv_blend_##name##_int (d4, da4, s4, m4, vl4), vl4);           \
+                                                                               \
+	    ra  = __riscv_vminu (ra, 255 * 255, vl);                           \
+	    rx4 = __riscv_vminu (rx4, 255 * 255, vl4);                         \
+                                                                               \
+	    ra = rvv_DIV_ONE_UN32m2_UN32m2_v (ra, vl);                         \
+	    rx = RVV_U8x4_U32_m2 (rvv_DIV_ONE_UN32m8_UN8m2_v (rx4, vl4));      \
+                                                                               \
+	    __riscv_vse32 (pd,                                                 \
+			   __riscv_vor (__riscv_vsll (ra, 24, vl),             \
+					__riscv_vand (rx, 0x00FFFFFF, vl),     \
+					vl),                                   \
+			   vl);                                                \
+	}                                                                      \
+    }
+
+static force_inline vuint32m8_t
+rvv_blend_screen_int (const vuint8m2_t d,
+		      const vuint8m2_t ad,
+		      const vuint8m2_t s,
+		      const vuint8m2_t as,
+		      size_t           vl)
+{
+    return __riscv_vsub (__riscv_vwaddu_vv (__riscv_vwmulu (s, ad, vl),
+					    __riscv_vwmulu (d, as, vl), vl),
+			 __riscv_vwcvtu_x (__riscv_vwmulu (s, d, vl), vl), vl);
+}
+
+PDF_SEPARABLE_BLEND_MODE (screen)
+
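+// Shared kernel for overlay and hard_light: where the selector mask is set,
+// the result is 2*s*d; otherwise it is as*ad - 2*(ad - d)*(as - s).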
+static force_inline vuint32m8_t
+_rvv_blend_overlay_hard_light (const vuint8m2_t d,
+			       const vuint8m2_t ad,
+			       const vuint8m2_t s,
+			       const vuint8m2_t as,
+			       const vbool4_t   selector,
+			       size_t           vl)
+{
+    vuint32m8_t out_true = __riscv_vwmulu (__riscv_vwmulu (s, d, vl), 2, vl);
+
+    vint16m4_t d_i  = rvv_u8m2_to_i16m4 (d, vl);
+    vint16m4_t ad_i = rvv_u8m2_to_i16m4 (ad, vl);
+    vint16m4_t s_i  = rvv_u8m2_to_i16m4 (s, vl);
+    vint16m4_t as_i = rvv_u8m2_to_i16m4 (as, vl);
+
+    vuint32m8_t out_false = __riscv_vreinterpret_v_i32m8_u32m8 (__riscv_vsub (
+	__riscv_vwmul (as_i, ad_i, vl),
+	__riscv_vsll (__riscv_vwmul (__riscv_vsub (ad_i, d_i, vl),
+				     __riscv_vsub (as_i, s_i, vl), vl),
+		      1, vl),
+	vl));
+
+    return __riscv_vmerge (out_false, out_true, selector, vl);
+}
+
+static force_inline vuint32m8_t
+rvv_blend_overlay_int (const vuint8m2_t d,
+		       const vuint8m2_t ad,
+		       const vuint8m2_t s,
+		       const vuint8m2_t as,
+		       size_t           vl)
+{
+    return _rvv_blend_overlay_hard_light (
+	d, ad, s, as,
+	__riscv_vmsltu (__riscv_vwmulu (d, 2, vl), __riscv_vwcvtu_x (ad, vl),
+			vl),
+	vl);
+}
+
+PDF_SEPARABLE_BLEND_MODE (overlay)
+
+static force_inline vuint32m8_t
+rvv_blend_darken_int (const vuint8m2_t d,
+		      const vuint8m2_t ad,
+		      const vuint8m2_t s,
+		      const vuint8m2_t as,
+		      size_t           vl)
+{
+    return __riscv_vwcvtu_x (__riscv_vminu (__riscv_vwmulu (ad, s, vl),
+					    __riscv_vwmulu (as, d, vl), vl),
+			     vl);
+}
+
+PDF_SEPARABLE_BLEND_MODE (darken)
+
+static force_inline vuint32m8_t
+rvv_blend_lighten_int (const vuint8m2_t d,
+		       const vuint8m2_t ad,
+		       const vuint8m2_t s,
+		       const vuint8m2_t as,
+		       size_t           vl)
+{
+    return __riscv_vwcvtu_x (__riscv_vmaxu (__riscv_vwmulu (as, d, vl),
+					    __riscv_vwmulu (ad, s, vl), vl),
+			     vl);
+}
+
+PDF_SEPARABLE_BLEND_MODE (lighten)
+
+static force_inline vuint32m8_t
+rvv_blend_hard_light_int (const vuint8m2_t d,
+			  const vuint8m2_t ad,
+			  const vuint8m2_t s,
+			  const vuint8m2_t as,
+			  size_t           vl)
+{
+    return _rvv_blend_overlay_hard_light (
+	d, ad, s, as,
+	__riscv_vmsltu (__riscv_vwmulu (s, 2, vl), __riscv_vwcvtu_x (as, vl),
+			vl),
+	vl);
+}
+
+PDF_SEPARABLE_BLEND_MODE (hard_light)
+
+static force_inline vuint32m8_t
+rvv_blend_difference_int (const vuint8m2_t d,
+			  const vuint8m2_t ad,
+			  const vuint8m2_t s,
+			  const vuint8m2_t as,
+			  size_t           vl)
+{
+    vuint16m4_t das = __riscv_vwmulu (d, as, vl);
+    vuint16m4_t sad = __riscv_vwmulu (s, ad, vl);
+
+    return __riscv_vmerge (__riscv_vwsubu_vv (sad, das, vl),
+			   __riscv_vwsubu_vv (das, sad, vl),
+			   __riscv_vmsltu (sad, das, vl), vl);
+}
+
+PDF_SEPARABLE_BLEND_MODE (difference)
+
+static force_inline vuint32m8_t
+rvv_blend_exclusion_int (const vuint8m2_t d,
+			 const vuint8m2_t ad,
+			 const vuint8m2_t s,
+			 const vuint8m2_t as,
+			 size_t           vl)
+{
+    return __riscv_vsub (__riscv_vwaddu_vv (__riscv_vwmulu (s, ad, vl),
+					    __riscv_vwmulu (d, as, vl), vl),
+			 __riscv_vwmulu (__riscv_vwmulu (d, s, vl), 2, vl), vl);
+}
+
+PDF_SEPARABLE_BLEND_MODE (exclusion)
+
+#undef PDF_SEPARABLE_BLEND_MODE
+
+static void
+rvv_combine_over_ca (pixman_implementation_t *__restrict__ imp,
+		     pixman_op_t op,
+		     uint32_t *__restrict__ dest,
+		     const uint32_t *__restrict__ src,
+		     const uint32_t *__restrict__ mask,
+		     int width)
+{
+    uint32_t *__restrict__ pd       = dest;
+    const uint32_t *__restrict__ ps = src;
+    const uint32_t *__restrict__ pm = mask;
+
+    vuint32m4_t s, m;
+
+    RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
+    {
+	s = __riscv_vle32_v_u32m4 (ps, vl);
+	m = __riscv_vle32_v_u32m4 (pm, vl);
+	rvv_combine_mask_ca_m4 (&s, &m, vl);
+
+	__riscv_vse32 (
+	    pd,
+	    rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_vvv_m4 (
+		__riscv_vle32_v_u32m4 (pd, vl), __riscv_vnot (m, vl), s, vl),
+	    vl);
+    }
+}
+
+static void
+rvv_combine_over_reverse_ca (pixman_implementation_t *__restrict__ imp,
+			     pixman_op_t op,
+			     uint32_t *__restrict__ dest,
+			     const uint32_t *__restrict__ src,
+			     const uint32_t *__restrict__ mask,
+			     int width)
+{
+    uint32_t *__restrict__ pd       = dest;
+    const uint32_t *__restrict__ ps = src;
+    const uint32_t *__restrict__ pm = mask;
+
+    vuint32m4_t d;
+
+    RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
+    {
+	d = __riscv_vle32_v_u32m4 (pd, vl);
+	__riscv_vse32 (
+	    pd,
+	    rvv_UN8x4_MUL_UN16_ADD_UN8x4_vvv_m4 (
+		rvv_UN8x4_MUL_UN8x4_vv_m4 (__riscv_vle32_v_u32m4 (ps, vl),
+					   __riscv_vle32_v_u32m4 (pm, vl), vl),
+		rvv_shift_not_alpha_u16 (d, vl), d, vl),
+	    vl);
+    }
+}
+
+static void
+rvv_combine_atop_ca (pixman_implementation_t *__restrict__ imp,
+		     pixman_op_t op,
+		     uint32_t *__restrict__ dest,
+		     const uint32_t *__restrict__ src,
+		     const uint32_t *__restrict__ mask,
+		     int width)
+{
+    uint32_t *__restrict__ pd       = dest;
+    const uint32_t *__restrict__ ps = src;
+    const uint32_t *__restrict__ pm = mask;
+
+    vuint32m4_t d, s, m;
+
+    RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
+    {
+	s = __riscv_vle32_v_u32m4 (ps, vl);
+	m = __riscv_vle32_v_u32m4 (pm, vl);
+	rvv_combine_mask_ca_m4 (&s, &m, vl);
+
+	d = __riscv_vle32_v_u32m4 (pd, vl);
+	__riscv_vse32 (
+	    pd,
+	    rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN16_vvvv_m4 (
+		d, __riscv_vnot (m, vl), s, rvv_shift_alpha_u16 (d, vl), vl),
+	    vl);
+    }
+}
+
+static void
+rvv_combine_xor_ca (pixman_implementation_t *__restrict__ imp,
+		    pixman_op_t op,
+		    uint32_t *__restrict__ dest,
+		    const uint32_t *__restrict__ src,
+		    const uint32_t *__restrict__ mask,
+		    int width)
+{
+    uint32_t *__restrict__ pd       = dest;
+    const uint32_t *__restrict__ ps = src;
+    const uint32_t *__restrict__ pm = mask;
+
+    vuint32m4_t d, s, m;
+
+    RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
+    {
+	s = __riscv_vle32_v_u32m4 (ps, vl);
+	m = __riscv_vle32_v_u32m4 (pm, vl);
+	rvv_combine_mask_ca_m4 (&s, &m, vl);
+
+	d = __riscv_vle32_v_u32m4 (pd, vl);
+	__riscv_vse32 (pd,
+		       rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN16_vvvv_m4 (
+			   d, __riscv_vnot (m, vl), s,
+			   rvv_shift_not_alpha_u16 (d, vl), vl),
+		       vl);
+    }
+}
+
+static void
+rvv_combine_atop_reverse_ca (pixman_implementation_t *__restrict__ imp,
+			     pixman_op_t op,
+			     uint32_t *__restrict__ dest,
+			     const uint32_t *__restrict__ src,
+			     const uint32_t *__restrict__ mask,
+			     int width)
+{
+    uint32_t *__restrict__ pd       = dest;
+    const uint32_t *__restrict__ ps = src;
+    const uint32_t *__restrict__ pm = mask;
+
+    vuint32m4_t d, s, m;
+
+    RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
+    {
+	s = __riscv_vle32_v_u32m4 (ps, vl);
+	m = __riscv_vle32_v_u32m4 (pm, vl);
+	rvv_combine_mask_ca_m4 (&s, &m, vl);
+
+	d = __riscv_vle32_v_u32m4 (pd, vl);
+	__riscv_vse32 (pd,
+		       rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN16_vvvv_m4 (
+			   d, m, s, rvv_shift_not_alpha_u16 (d, vl), vl),
+		       vl);
+    }
+}
+
+static void
+rvv_combine_src_ca (pixman_implementation_t *__restrict__ imp,
+		    pixman_op_t op,
+		    uint32_t *__restrict__ dest,
+		    const uint32_t *__restrict__ src,
+		    const uint32_t *__restrict__ mask,
+		    int width)
+{
+    uint32_t *__restrict__ pd       = dest;
+    const uint32_t *__restrict__ ps = src;
+    const uint32_t *__restrict__ pm = mask;
+
+    RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
+    {
+	__riscv_vse32 (
+	    pd,
+	    rvv_combine_mask_value_ca_m4 (__riscv_vle32_v_u32m4 (ps, vl),
+					  __riscv_vle32_v_u32m4 (pm, vl), vl),
+	    vl);
+    }
+}
+
+static void
+rvv_combine_in_ca (pixman_implementation_t *__restrict__ imp,
+		   pixman_op_t op,
+		   uint32_t *__restrict__ dest,
+		   const uint32_t *__restrict__ src,
+		   const uint32_t *__restrict__ mask,
+		   int width)
+{
+    uint32_t *__restrict__ pd       = dest;
+    const uint32_t *__restrict__ ps = src;
+    const uint32_t *__restrict__ pm = mask;
+
+    RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
+    {
+	__riscv_vse32 (pd,
+		       rvv_in_m4 (rvv_combine_mask_value_ca_m4 (
+				      __riscv_vle32_v_u32m4 (ps, vl),
+				      __riscv_vle32_v_u32m4 (pm, vl), vl),
+				  rvv_load_alpha_u8m1 (pd, vl), vl),
+		       vl);
+    }
+}
+
+static void
+rvv_combine_in_reverse_ca (pixman_implementation_t *imp,
+			   pixman_op_t              op,
+			   uint32_t                *dest,
+			   const uint32_t          *src,
+			   const uint32_t          *mask,
+			   int                      width)
+{
+    uint32_t *__restrict__ pd       = dest;
+    const uint32_t *__restrict__ ps = src;
+    const uint32_t *__restrict__ pm = mask;
+
+    RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
+    {
+	__riscv_vse32 (
+	    pd,
+	    rvv_UN8x4_MUL_UN8x4_vv_m4 (__riscv_vle32_v_u32m4 (pd, vl),
+				       rvv_combine_mask_alpha_ca_m4 (
+					   __riscv_vle32_v_u32m4 (ps, vl),
+					   __riscv_vle32_v_u32m4 (pm, vl), vl),
+				       vl),
+	    vl);
+    }
+}
+
+static void
+rvv_combine_out_ca (pixman_implementation_t *__restrict__ imp,
+		    pixman_op_t op,
+		    uint32_t *__restrict__ dest,
+		    const uint32_t *__restrict__ src,
+		    const uint32_t *__restrict__ mask,
+		    int width)
+{
+    uint32_t *__restrict__ pd       = dest;
+    const uint32_t *__restrict__ ps = src;
+    const uint32_t *__restrict__ pm = mask;
+
+    RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
+    {
+	__riscv_vse32 (pd,
+		       rvv_in_m4 (rvv_combine_mask_value_ca_m4 (
+				      __riscv_vle32_v_u32m4 (ps, vl),
+				      __riscv_vle32_v_u32m4 (pm, vl), vl),
+				  rvv_load_not_alpha_u8m1 (pd, vl), vl),
+		       vl);
+    }
+}
+
+static void
+rvv_combine_out_reverse_ca (pixman_implementation_t *imp,
+			    pixman_op_t              op,
+			    uint32_t                *dest,
+			    const uint32_t          *src,
+			    const uint32_t          *mask,
+			    int                      width)
+{
+    uint32_t *__restrict__ pd       = dest;
+    const uint32_t *__restrict__ ps = src;
+    const uint32_t *__restrict__ pm = mask;
+
+    RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
+    {
+	__riscv_vse32 (
+	    pd,
+	    rvv_UN8x4_MUL_UN8x4_vv_m4 (
+		__riscv_vle32_v_u32m4 (pd, vl),
+		__riscv_vnot_v_u32m4 (rvv_combine_mask_alpha_ca_m4 (
+					  __riscv_vle32_v_u32m4 (ps, vl),
+					  __riscv_vle32_v_u32m4 (pm, vl), vl),
+				      vl),
+		vl),
+	    vl);
+    }
+}
+
+static void
+rvv_combine_add_ca (pixman_implementation_t *__restrict__ imp,
+		    pixman_op_t op,
+		    uint32_t *__restrict__ dest,
+		    const uint32_t *__restrict__ src,
+		    const uint32_t *__restrict__ mask,
+		    int width)
+{
+    uint32_t *__restrict__ pd       = dest;
+    const uint32_t *__restrict__ ps = src;
+    const uint32_t *__restrict__ pm = mask;
+
+    RVV_FOREACH_3 (width, vl, e32m4, ps, pm, pd)
+    {
+	__riscv_vse32 (
+	    pd,
+	    rvv_UN8x4_ADD_UN8x4_vv_m4 (__riscv_vle32_v_u32m4 (pd, vl),
+				       rvv_combine_mask_value_ca_m4 (
+					   __riscv_vle32_v_u32m4 (ps, vl),
+					   __riscv_vle32_v_u32m4 (pm, vl), vl),
+				       vl),
+	    vl);
+    }
+}
+
+static void
+rvv_composite_src_x888_8888 (pixman_implementation_t *__restrict__ imp,
+			     pixman_composite_info_t *__restrict__ info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t *__restrict__ dst_line, *__restrict__ dst;
+    uint32_t *__restrict__ src_line, *__restrict__ src;
+    int32_t dst_stride, src_stride;
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride,
+			   dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride,
+			   src_line, 1);
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+
+	RVV_FOREACH_2 (width, vl, e32m8, src, dst)
+	{
+	    __riscv_vse32 (
+		dst,
+		__riscv_vor (__riscv_vle32_v_u32m8 (src, vl), 0xff000000, vl),
+		vl);
+	}
+    }
+}
+
+static void
+rvv_composite_src_8888_8888 (pixman_implementation_t *__restrict__ imp,
+			     pixman_composite_info_t *__restrict__ info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t *__restrict__ dst_line, *__restrict__ dst;
+    uint32_t *__restrict__ src_line, *__restrict__ src;
+    int32_t dst_stride, src_stride;
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride,
+			   dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride,
+			   src_line, 1);
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+
+	RVV_FOREACH_2 (width, vl, e32m8, src, dst)
+	{
+	    __riscv_vse32 (dst, __riscv_vle32_v_u32m8 (src, vl), vl);
+	}
+    }
+}
+
+static void
+rvv_composite_over_x888_8_8888 (pixman_implementation_t *__restrict__ imp,
+				pixman_composite_info_t *__restrict__ info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t *__restrict__ src, *__restrict__ src_line;
+    uint32_t *__restrict__ dst, *__restrict__ dst_line;
+    uint8_t *__restrict__ mask, *__restrict__ mask_line;
+    int32_t src_stride, mask_stride, dst_stride;
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride,
+			   dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride,
+			   mask_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride,
+			   src_line, 1);
+
+    while (height--)
+    {
+	src = src_line;
+	src_line += src_stride;
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+
+	RVV_FOREACH_3 (width, vl, e32m4, src, mask, dst)
+	{
+	    __riscv_vse32 (
+		dst,
+		rvv_over_m4 (
+		    rvv_in_m4 (__riscv_vor (__riscv_vle32_v_u32m4 (src, vl),
+					    0xff000000, vl),
+			       __riscv_vle8_v_u8m1 (mask, vl), vl),
+		    __riscv_vle32_v_u32m4 (dst, vl), vl),
+		vl);
+	}
+    }
+}
+
+static void
+rvv_composite_over_8888_8888 (pixman_implementation_t *imp,
+			      pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t *dst_line, *dst;
+    uint32_t *src_line, *src;
+    int       dst_stride, src_stride;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride,
+			   dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride,
+			   src_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+
+	RVV_FOREACH_2 (width, vl, e32m4, src, dst)
+	{
+	    __riscv_vse32 (dst,
+			   rvv_over_m4 (__riscv_vle32_v_u32m4 (src, vl),
+					__riscv_vle32_v_u32m4 (dst, vl), vl),
+			   vl);
+	}
+    }
+}
+
+static void
+rvv_composite_over_n_8_0565 (pixman_implementation_t *imp,
+			     pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t *__restrict__ dst_line, *__restrict__ dst;
+    uint8_t *__restrict__ mask_line, *__restrict__ mask;
+    int         dst_stride, mask_stride;
+    uint32_t    src;
+    vuint32m4_t vsrc;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    if (src == 0)
+	return;
+    vsrc = __riscv_vmv_v_x_u32m4 (src, __riscv_vsetvlmax_e32m4 ());
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride,
+			   dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride,
+			   mask_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+
+	RVV_FOREACH_2 (width, vl, e16m2, mask, dst)
+	{
+	    __riscv_vse16 (
+		dst,
+		rvv_convert_8888_to_0565_m2 (
+		    rvv_over_m4 (
+			rvv_in_m4 (vsrc, __riscv_vle8_v_u8m1 (mask, vl), vl),
+			rvv_convert_0565_to_0888_m4 (
+			    __riscv_vle16_v_u16m2 (dst, vl), vl),
+			vl),
+		    vl),
+		vl);
+	}
+    }
+}
+
+static void
+rvv_composite_over_n_8_8888 (pixman_implementation_t *imp,
+			     pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t   *dst_line, *dst;
+    uint8_t    *mask_line, *mask;
+    int         dst_stride, mask_stride;
+    uint32_t    src;
+    vuint32m4_t vsrc;
+
+    src  = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    vsrc = __riscv_vmv_v_x_u32m4 (src, __riscv_vsetvlmax_e32m4 ());
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride,
+			   dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride,
+			   mask_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+
+	RVV_FOREACH_2 (width, vl, e32m4, mask, dst)
+	{
+	    __riscv_vse32 (
+		dst,
+		rvv_over_m4 (
+		    rvv_in_m4 (vsrc, __riscv_vle8_v_u8m1 (mask, vl), vl),
+		    __riscv_vle32_v_u32m4 (dst, vl), vl),
+		vl);
+	}
+    }
+}
+
+static void
+rvv_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
+				  pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t   *dst_line, *dst;
+    uint32_t   *mask_line, *mask;
+    int         dst_stride, mask_stride;
+    uint32_t    src;
+    vuint32m4_t vsrc;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    if (src == 0)
+	return;
+    vsrc = __riscv_vmv_v_x_u32m4 (src, __riscv_vsetvlmax_e32m4 ());
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride,
+			   dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride,
+			   mask_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+
+	RVV_FOREACH_2 (width, vl, e32m4, mask, dst)
+	{
+	    __riscv_vse32 (dst,
+			   rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_vvv_m4 (
+			       __riscv_vle32_v_u32m4 (mask, vl), vsrc,
+			       __riscv_vle32_v_u32m4 (dst, vl), vl),
+			   vl);
+	}
+    }
+}
+
+static void
+rvv_composite_over_n_8888_8888_ca (pixman_implementation_t *__restrict__ imp,
+				   pixman_composite_info_t *__restrict__ info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t *__restrict__ dst_line, *__restrict__ dst;
+    uint32_t *__restrict__ mask_line, *__restrict__ mask;
+    int         dst_stride, mask_stride;
+    uint32_t    src, srca;
+    vuint32m4_t vsrc;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    if (src == 0)
+	return;
+    srca = src >> 24;
+    vsrc = __riscv_vmv_v_x_u32m4 (src, __riscv_vsetvlmax_e32m4 ());
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride,
+			   dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride,
+			   mask_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+
+	RVV_FOREACH_2 (width, vl, e32m4, mask, dst)
+	{
+	    vuint32m4_t m = __riscv_vle32_v_u32m4 (mask, vl);
+	    __riscv_vse32 (
+		dst,
+		rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_vvv_m4 (
+		    __riscv_vle32_v_u32m4 (dst, vl),
+		    __riscv_vnot (rvv_UN8x4_MUL_UN8_vx_m4 (m, srca, vl), vl),
+		    rvv_UN8x4_MUL_UN8x4_vv_m4 (m, vsrc, vl), vl),
+		vl);
+	}
+    }
+}
+
+static void
+rvv_composite_over_n_8888_0565_ca (pixman_implementation_t *__restrict__ imp,
+				   pixman_composite_info_t *__restrict__ info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t *__restrict__ dst_line, *__restrict__ dst;
+    uint32_t *__restrict__ mask_line, *__restrict__ mask;
+    int         dst_stride, mask_stride;
+    uint32_t    src, srca;
+    vuint32m4_t vsrc;
+
+    src  = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    srca = src >> 24;
+    if (src == 0)
+	return;
+    vsrc = __riscv_vmv_v_x_u32m4 (src, __riscv_vsetvlmax_e32m4 ());
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride,
+			   dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride,
+			   mask_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+
+	RVV_FOREACH_2 (width, vl, e32m4, mask, dst)
+	{
+	    vuint32m4_t ma = __riscv_vle32_v_u32m4 (mask, vl);
+
+	    __riscv_vse16 (
+		dst,
+		rvv_convert_8888_to_0565_m2 (
+		    rvv_UN8x4_MUL_UN8x4_ADD_UN8x4_vvv_m4 (
+			rvv_convert_0565_to_0888_m4 (
+			    __riscv_vle16_v_u16m2 (dst, vl), vl),
+			__riscv_vnot (rvv_UN8x4_MUL_UN8_vx_m4 (ma, srca, vl),
+				      vl),
+			rvv_UN8x4_MUL_UN8x4_vv_m4 (ma, vsrc, vl), vl),
+		    vl),
+		vl);
+	}
+    }
+}
+
+static void
+rvv_composite_over_8888_0565 (pixman_implementation_t *__restrict__ imp,
+			      pixman_composite_info_t *__restrict__ info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t *__restrict__ dst_line, *__restrict__ dst;
+    uint32_t *__restrict__ src_line, *__restrict__ src;
+    int dst_stride, src_stride;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride,
+			   src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride,
+			   dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+
+	RVV_FOREACH_2 (width, vl, e16m2, src, dst)
+	{
+	    __riscv_vse16 (
+		dst,
+		rvv_convert_8888_to_0565_m2 (
+		    rvv_over_m4 (__riscv_vle32_v_u32m4 (src, vl),
+				 rvv_convert_0565_to_0888_m4 (
+				     __riscv_vle16_v_u16m2 (dst, vl), vl),
+				 vl),
+		    vl),
+		vl);
+	}
+    }
+}
+
+static void
+rvv_composite_add_8_8 (pixman_implementation_t *imp,
+		       pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t *dst_line, *dst;
+    uint8_t *src_line, *src;
+    int      dst_stride, src_stride;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride,
+			   src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride,
+			   dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+
+	RVV_FOREACH_2 (width, vl, e8m8, src, dst)
+	{
+	    __riscv_vse8 (dst,
+			  rvv_UN8_ADD_UN8_vv (__riscv_vle8_v_u8m8 (src, vl),
+					      __riscv_vle8_v_u8m8 (dst, vl),
+					      vl),
+			  vl);
+	}
+    }
+}
+
+static void
+rvv_composite_add_0565_0565 (pixman_implementation_t *imp,
+			     pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t *dst_line, *dst;
+    uint16_t *src_line, *src;
+    int       dst_stride, src_stride;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride,
+			   src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride,
+			   dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+
+	RVV_FOREACH_2 (width, vl, e16m2, src, dst)
+	{
+	    __riscv_vse16 (dst,
+			   rvv_convert_8888_to_0565_m2 (
+			       rvv_UN8x4_ADD_UN8x4_vv_m4 (
+				   rvv_convert_0565_to_8888_m4 (
+				       __riscv_vle16_v_u16m2 (src, vl), vl),
+				   rvv_convert_0565_to_8888_m4 (
+				       __riscv_vle16_v_u16m2 (dst, vl), vl),
+				   vl),
+			       vl),
+			   vl);
+	}
+    }
+}
+
+static void
+rvv_composite_add_8888_8888 (pixman_implementation_t *__restrict__ imp,
+			     pixman_composite_info_t *__restrict__ info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t *__restrict__ dst_line, *__restrict__ dst;
+    uint32_t *__restrict__ src_line, *__restrict__ src;
+    int dst_stride, src_stride;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride,
+			   src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride,
+			   dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+
+	RVV_FOREACH_2 (width, vl, e32m4, src, dst)
+	{
+	    __riscv_vse32 (
+		dst,
+		rvv_UN8x4_ADD_UN8x4_vv_m4 (__riscv_vle32_v_u32m4 (src, vl),
+					   __riscv_vle32_v_u32m4 (dst, vl), vl),
+		vl);
+	}
+    }
+}
+
+static void
+rvv_composite_add_n_8_8 (pixman_implementation_t *imp,
+			 pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t *dst_line, *dst;
+    uint8_t *mask_line, *mask;
+    int      dst_stride, mask_stride;
+    uint32_t src;
+    uint8_t  sa;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride,
+			   dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride,
+			   mask_line, 1);
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    sa  = (src >> 24);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+
+	RVV_FOREACH_2 (width, vl, e8m4, mask, dst)
+	{
+	    __riscv_vse8 (
+		dst,
+		rvv_UN8_ADD_UN8_vv (rvv_UN8_MUL_UN8_vx_m4 (
+					__riscv_vle8_v_u8m4 (mask, vl), sa, vl),
+				    __riscv_vle8_v_u8m4 (dst, vl), vl),
+		vl);
+	}
+    }
+}
+
+static void
+rvv_composite_src_memcpy (pixman_implementation_t *imp,
+			  pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    int      bpp     = PIXMAN_FORMAT_BPP (dest_image->bits.format) / 8;
+    uint32_t n_bytes = width * bpp;
+    int      dst_stride, src_stride;
+    uint8_t *dst;
+    uint8_t *src;
+
+    src_stride = src_image->bits.rowstride * 4;
+    dst_stride = dest_image->bits.rowstride * 4;
+
+    src = (uint8_t *)src_image->bits.bits + src_y * src_stride + src_x * bpp;
+    dst = (uint8_t *)dest_image->bits.bits + dest_y * dst_stride + dest_x * bpp;
+
+    while (height--)
+    {
+	memcpy (dst, src, n_bytes);
+
+	dst += dst_stride;
+	src += src_stride;
+    }
+}
+
+static void
+rvv_composite_in_n_8_8 (pixman_implementation_t *imp,
+			pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint8_t *dst_line, *dst;
+    uint8_t *mask_line, *mask;
+    int      dst_stride, mask_stride;
+
+    src  = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    srca = src >> 24;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride,
+			   dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride,
+			   mask_line, 1);
+
+    if (srca == 0xff)
+    {
+	while (height--)
+	{
+	    dst = dst_line;
+	    dst_line += dst_stride;
+	    mask = mask_line;
+	    mask_line += mask_stride;
+
+	    RVV_FOREACH_2 (width, vl, e8m4, mask, dst)
+	    {
+		__riscv_vse8 (
+		    dst,
+		    rvv_UN8_MUL_UN8_vv_m4 (__riscv_vle8_v_u8m4 (mask, vl),
+					   __riscv_vle8_v_u8m4 (dst, vl), vl),
+		    vl);
+	    }
+	}
+    }
+    else
+    {
+	while (height--)
+	{
+	    dst = dst_line;
+	    dst_line += dst_stride;
+	    mask = mask_line;
+	    mask_line += mask_stride;
+
+	    RVV_FOREACH_2 (width, vl, e8m4, mask, dst)
+	    {
+		__riscv_vse8 (dst,
+			      rvv_UN8_MUL_UN8_vv_m4 (
+				  rvv_UN8_MUL_UN8_vx_m4 (
+				      __riscv_vle8_v_u8m4 (mask, vl), srca, vl),
+				  __riscv_vle8_v_u8m4 (dst, vl), vl),
+			      vl);
+	    }
+	}
+    }
+}
+
+static void
+rvv_composite_in_8_8 (pixman_implementation_t *imp,
+		      pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t *dst_line, *dst;
+    uint8_t *src_line, *src;
+    int      dst_stride, src_stride;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride,
+			   src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride,
+			   dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+
+	RVV_FOREACH_2 (width, vl, e8m4, src, dst)
+	{
+	    __riscv_vse8 (dst,
+			  rvv_UN8_MUL_UN8_vv_m4 (__riscv_vle8_v_u8m4 (src, vl),
+						 __riscv_vle8_v_u8m4 (dst, vl),
+						 vl),
+			  vl);
+	}
+    }
+}
+
+#define A1_FILL_MASK(n, offs) (((1U << (n)) - 1) << (offs))
+
+/*
+ * There is some potential for hand vectorization, but for now let's leave it
+ * autovectorized.
+ */
+static force_inline void
+pixman_fill1_line (uint32_t *dst, int offs, int width, int v)
+{
+    if (offs)
+    {
+	int leading_pixels = 32 - offs;
+	if (leading_pixels >= width)
+	{
+	    if (v)
+		*dst |= A1_FILL_MASK (width, offs);
+	    else
+		*dst &= ~A1_FILL_MASK (width, offs);
+	    return;
+	}
+	else
+	{
+	    if (v)
+		*dst++ |= A1_FILL_MASK (leading_pixels, offs);
+	    else
+		*dst++ &= ~A1_FILL_MASK (leading_pixels, offs);
+	    width -= leading_pixels;
+	}
+    }
+    while (width >= 32)
+    {
+	if (v)
+	    *dst++ = 0xFFFFFFFF;
+	else
+	    *dst++ = 0;
+	width -= 32;
+    }
+    if (width > 0)
+    {
+	if (v)
+	    *dst |= A1_FILL_MASK (width, 0);
+	else
+	    *dst &= ~A1_FILL_MASK (width, 0);
+    }
+}
+
+static void
+rvv_fill1 (uint32_t *bits,
+	   int       stride,
+	   int       x,
+	   int       y,
+	   int       width,
+	   int       height,
+	   uint32_t  filler)
+{
+    uint32_t *dst  = bits + y * stride + (x >> 5);
+    int       offs = x & 31;
+
+    while (height--)
+    {
+	pixman_fill1_line (dst, offs, width, (filler & 1));
+	dst += stride;
+    }
+}
+
+#define RVV_FILL(dtypew)                                                            \
+    static void rvv_fill_u##dtypew (uint32_t *__restrict__ bits, int stride,        \
+				    int x, int y, int width, int height,            \
+				    uint32_t filler)                                \
+    {                                                                               \
+	uint##dtypew##_t *__restrict__ bitsw = (uint##dtypew##_t *)bits;            \
+	int32_t             vstride          = stride * (32 / dtypew);              \
+	vuint##dtypew##m8_t vfiller          = __riscv_vmv_v_x_u##dtypew##m8 (      \
+            (uint##dtypew##_t)filler, __riscv_vsetvlmax_e##dtypew##m8 ()); \
+                                                                                    \
+	bitsw += y * vstride + x;                                                   \
+	while (height--)                                                            \
+	{                                                                           \
+	    uint##dtypew##_t *__restrict__ d = bitsw;                               \
+                                                                                    \
+	    RVV_FOREACH_1 (width, vl, e##dtypew##m8, d)                             \
+	    {                                                                       \
+		__riscv_vse##dtypew (d, vfiller, vl);                               \
+	    }                                                                       \
+                                                                                    \
+	    bitsw += vstride;                                                       \
+	}                                                                           \
+    }
+
+RVV_FILL (8);
+RVV_FILL (16);
+RVV_FILL (32);
+
+static pixman_bool_t
+rvv_fill (pixman_implementation_t *__restrict__ imp,
+	  uint32_t *__restrict__ bits,
+	  int      stride,
+	  int      bpp,
+	  int      x,
+	  int      y,
+	  int      width,
+	  int      height,
+	  uint32_t filler)
+{
+    switch (bpp)
+    {
+	case 1:
+	    rvv_fill1 (bits, stride, x, y, width, height, filler);
+	    break;
+	case 8:
+	    rvv_fill_u8 (bits, stride, x, y, width, height, filler);
+	    break;
+	case 16:
+	    rvv_fill_u16 (bits, stride, x, y, width, height, filler);
+	    break;
+	case 32:
+	    rvv_fill_u32 (bits, stride, x, y, width, height, filler);
+	    break;
+	default:
+	    return FALSE;
+    }
+
+    return TRUE;
+}
+
+static void
+rvv_composite_solid_fill (pixman_implementation_t *imp,
+			  pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (dest_image->bits.format == PIXMAN_a1)
+    {
+	src = src >> 31;
+    }
+    else if (dest_image->bits.format == PIXMAN_a8)
+    {
+	src = src >> 24;
+    }
+    else if (dest_image->bits.format == PIXMAN_r5g6b5 ||
+	     dest_image->bits.format == PIXMAN_b5g6r5)
+    {
+	src = convert_8888_to_0565 (src);
+    }
+
+    rvv_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
+	      PIXMAN_FORMAT_BPP (dest_image->bits.format), dest_x, dest_y,
+	      width, height, src);
+}
+
+#define RVV_BLT(dtypew)                                                        \
+    static void rvv_blt_u##dtypew (                                            \
+	uint32_t *__restrict__ src_bits, uint32_t *__restrict__ dst_bits,      \
+	int src_stride, int dst_stride, int src_x, int src_y, int dest_x,      \
+	int dest_y, int width, int height)                                     \
+    {                                                                          \
+	uint##dtypew##_t *src_w = (uint##dtypew##_t *)src_bits;                \
+	uint##dtypew##_t *dst_w = (uint##dtypew##_t *)dst_bits;                \
+                                                                               \
+	src_stride = src_stride * (32 / dtypew);                               \
+	dst_stride = dst_stride * (32 / dtypew);                               \
+                                                                               \
+	src_w += src_stride * src_y + src_x;                                   \
+	dst_w += dst_stride * dest_y + dest_x;                                 \
+                                                                               \
+	while (height--)                                                       \
+	{                                                                      \
+	    uint##dtypew##_t *__restrict__ pd = dst_w;                         \
+	    uint##dtypew##_t *__restrict__ ps = src_w;                         \
+                                                                               \
+	    RVV_FOREACH_2 (width, vl, e##dtypew##m8, ps, pd)                   \
+	    {                                                                  \
+		__riscv_vse##dtypew (                                          \
+		    pd, __riscv_vle##dtypew##_v_u##dtypew##m8 (ps, vl), vl);   \
+	    }                                                                  \
+                                                                               \
+	    dst_w += dst_stride;                                               \
+	    src_w += src_stride;                                               \
+	}                                                                      \
+    }
+RVV_BLT (8);
+RVV_BLT (16);
+RVV_BLT (32);
+
+static pixman_bool_t
+rvv_blt (pixman_implementation_t *__restrict__ imp,
+	 uint32_t *__restrict__ src_bits,
+	 uint32_t *__restrict__ dst_bits,
+	 int src_stride,
+	 int dst_stride,
+	 int src_bpp,
+	 int dst_bpp,
+	 int src_x,
+	 int src_y,
+	 int dest_x,
+	 int dest_y,
+	 int width,
+	 int height)
+{
+    if (src_bpp != dst_bpp)
+	return FALSE;
+
+    switch (src_bpp)
+    {
+	case 8:
+	    rvv_blt_u8 (src_bits, dst_bits, src_stride, dst_stride, src_x,
+			src_y, dest_x, dest_y, width, height);
+	    break;
+	case 16:
+	    rvv_blt_u16 (src_bits, dst_bits, src_stride, dst_stride, src_x,
+			 src_y, dest_x, dest_y, width, height);
+	    break;
+	case 32:
+	    rvv_blt_u32 (src_bits, dst_bits, src_stride, dst_stride, src_x,
+			 src_y, dest_x, dest_y, width, height);
+	    break;
+	default:
+	    return FALSE;
+    }
+
+    return TRUE;
+}
+
+// clang-format off
+static const pixman_fast_path_t rvv_fast_paths[] = {
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, rvv_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, rvv_composite_over_n_8_0565),
+    // PIXMAN_STD_FAST_PATH (OVER, solid, a8, r8g8b8, rvv_composite_over_n_8_0888),
+    // PIXMAN_STD_FAST_PATH (OVER, solid, a8, b8g8r8, rvv_composite_over_n_8_0888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, rvv_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, rvv_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, rvv_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, rvv_composite_over_n_8_8888),
+    // PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8r8g8b8, rvv_composite_over_n_1_8888),
+    // PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8r8g8b8, rvv_composite_over_n_1_8888),
+    // PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8b8g8r8, rvv_composite_over_n_1_8888),
+    // PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8b8g8r8, rvv_composite_over_n_1_8888),
+    // PIXMAN_STD_FAST_PATH (OVER, solid, a1, r5g6b5,   rvv_composite_over_n_1_0565),
+    // PIXMAN_STD_FAST_PATH (OVER, solid, a1, b5g6r5,   rvv_composite_over_n_1_0565),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, rvv_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, rvv_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, rvv_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, rvv_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, rvv_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, rvv_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, rvv_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, rvv_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, rvv_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, rvv_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, rvv_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, rvv_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, rvv_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, rvv_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, rvv_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, rvv_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (ADD, r5g6b5, null, r5g6b5, rvv_composite_add_0565_0565),
+    PIXMAN_STD_FAST_PATH (ADD, b5g6r5, null, b5g6r5, rvv_composite_add_0565_0565),
+    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, rvv_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, rvv_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, rvv_composite_add_8_8),
+    // PIXMAN_STD_FAST_PATH (ADD, a1, null, a1, fast_composite_add_1_1),
+    PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, rvv_composite_add_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, rvv_composite_add_n_8_8),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8r8g8b8, rvv_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, x8r8g8b8, rvv_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8b8g8r8, rvv_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, x8b8g8r8, rvv_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, a1, rvv_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8, rvv_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, r5g6b5, rvv_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, rvv_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, rvv_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, rvv_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, rvv_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, rvv_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, rvv_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, rvv_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, rvv_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8x8, rvv_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8a8, rvv_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, b8g8r8x8, null, b8g8r8x8, rvv_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, rvv_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, rvv_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, rvv_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, b8g8r8, rvv_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x1r5g5b5, rvv_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, rvv_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, rvv_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, rvv_composite_in_8_8),
+    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, rvv_composite_in_n_8_8),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, rvv_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, rvv_composite_src_8888_8888),
+
+    {PIXMAN_OP_NONE},
+};
+
+pixman_implementation_t *
+_pixman_implementation_create_rvv (pixman_implementation_t *fallback)
+{
+    pixman_implementation_t *imp = _pixman_implementation_create (
+	fallback, rvv_fast_paths);
+
+    // clang-format off
+    imp->combine_float[PIXMAN_OP_CLEAR] = rvv_combine_clear_u_float;
+    imp->combine_float[PIXMAN_OP_SRC] = rvv_combine_src_u_float;
+    imp->combine_float[PIXMAN_OP_DST] = rvv_combine_dst_u_float;
+    imp->combine_float[PIXMAN_OP_OVER] = rvv_combine_over_u_float;
+    imp->combine_float[PIXMAN_OP_OVER_REVERSE] = rvv_combine_over_reverse_u_float;
+    imp->combine_float[PIXMAN_OP_IN] = rvv_combine_in_u_float;
+    imp->combine_float[PIXMAN_OP_IN_REVERSE] = rvv_combine_in_reverse_u_float;
+    imp->combine_float[PIXMAN_OP_OUT] = rvv_combine_out_u_float;
+    imp->combine_float[PIXMAN_OP_OUT_REVERSE] = rvv_combine_out_reverse_u_float;
+    imp->combine_float[PIXMAN_OP_ATOP] = rvv_combine_atop_u_float;
+    imp->combine_float[PIXMAN_OP_ATOP_REVERSE] = rvv_combine_atop_reverse_u_float;
+    imp->combine_float[PIXMAN_OP_XOR] = rvv_combine_xor_u_float;
+    imp->combine_float[PIXMAN_OP_ADD] = rvv_combine_add_u_float;
+    imp->combine_float[PIXMAN_OP_SATURATE] = rvv_combine_saturate_u_float;
+
+    /* Disjoint, unified */
+    imp->combine_float[PIXMAN_OP_DISJOINT_CLEAR] = rvv_combine_disjoint_clear_u_float;
+    imp->combine_float[PIXMAN_OP_DISJOINT_SRC] = rvv_combine_disjoint_src_u_float;
+    imp->combine_float[PIXMAN_OP_DISJOINT_DST] = rvv_combine_disjoint_dst_u_float;
+    imp->combine_float[PIXMAN_OP_DISJOINT_OVER] = rvv_combine_disjoint_over_u_float;
+    imp->combine_float[PIXMAN_OP_DISJOINT_OVER_REVERSE] = rvv_combine_disjoint_over_reverse_u_float;
+    imp->combine_float[PIXMAN_OP_DISJOINT_IN] = rvv_combine_disjoint_in_u_float;
+    imp->combine_float[PIXMAN_OP_DISJOINT_IN_REVERSE] = rvv_combine_disjoint_in_reverse_u_float;
+    imp->combine_float[PIXMAN_OP_DISJOINT_OUT] = rvv_combine_disjoint_out_u_float;
+    imp->combine_float[PIXMAN_OP_DISJOINT_OUT_REVERSE] = rvv_combine_disjoint_out_reverse_u_float;
+    imp->combine_float[PIXMAN_OP_DISJOINT_ATOP] = rvv_combine_disjoint_atop_u_float;
+    imp->combine_float[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = rvv_combine_disjoint_atop_reverse_u_float;
+    imp->combine_float[PIXMAN_OP_DISJOINT_XOR] = rvv_combine_disjoint_xor_u_float;
+
+    /* Conjoint, unified */
+    imp->combine_float[PIXMAN_OP_CONJOINT_CLEAR] = rvv_combine_conjoint_clear_u_float;
+    imp->combine_float[PIXMAN_OP_CONJOINT_SRC] = rvv_combine_conjoint_src_u_float;
+    imp->combine_float[PIXMAN_OP_CONJOINT_DST] = rvv_combine_conjoint_dst_u_float;
+    imp->combine_float[PIXMAN_OP_CONJOINT_OVER] = rvv_combine_conjoint_over_u_float;
+    imp->combine_float[PIXMAN_OP_CONJOINT_OVER_REVERSE] = rvv_combine_conjoint_over_reverse_u_float;
+    imp->combine_float[PIXMAN_OP_CONJOINT_IN] = rvv_combine_conjoint_in_u_float;
+    imp->combine_float[PIXMAN_OP_CONJOINT_IN_REVERSE] = rvv_combine_conjoint_in_reverse_u_float;
+    imp->combine_float[PIXMAN_OP_CONJOINT_OUT] = rvv_combine_conjoint_out_u_float;
+    imp->combine_float[PIXMAN_OP_CONJOINT_OUT_REVERSE] = rvv_combine_conjoint_out_reverse_u_float;
+    imp->combine_float[PIXMAN_OP_CONJOINT_ATOP] = rvv_combine_conjoint_atop_u_float;
+    imp->combine_float[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = rvv_combine_conjoint_atop_reverse_u_float;
+    imp->combine_float[PIXMAN_OP_CONJOINT_XOR] = rvv_combine_conjoint_xor_u_float;
+
+    /* PDF operators, unified */
+    imp->combine_float[PIXMAN_OP_MULTIPLY] = rvv_combine_multiply_u_float;
+    imp->combine_float[PIXMAN_OP_SCREEN] = rvv_combine_screen_u_float;
+    imp->combine_float[PIXMAN_OP_OVERLAY] = rvv_combine_overlay_u_float;
+    imp->combine_float[PIXMAN_OP_DARKEN] = rvv_combine_darken_u_float;
+    imp->combine_float[PIXMAN_OP_LIGHTEN] = rvv_combine_lighten_u_float;
+    imp->combine_float[PIXMAN_OP_HARD_LIGHT] = rvv_combine_hard_light_u_float;
+    imp->combine_float[PIXMAN_OP_SOFT_LIGHT] = rvv_combine_soft_light_u_float;
+    imp->combine_float[PIXMAN_OP_DIFFERENCE] = rvv_combine_difference_u_float;
+    imp->combine_float[PIXMAN_OP_EXCLUSION] = rvv_combine_exclusion_u_float;
+    imp->combine_float[PIXMAN_OP_COLOR_DODGE] = rvv_combine_color_dodge_u_float;
+    imp->combine_float[PIXMAN_OP_COLOR_BURN] = rvv_combine_color_burn_u_float;
+
+    /* Component alpha combiners */
+    imp->combine_float_ca[PIXMAN_OP_CLEAR] = rvv_combine_clear_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_SRC] = rvv_combine_src_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_DST] = rvv_combine_dst_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_OVER] = rvv_combine_over_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_OVER_REVERSE] = rvv_combine_over_reverse_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_IN] = rvv_combine_in_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_IN_REVERSE] = rvv_combine_in_reverse_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_OUT] = rvv_combine_out_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_OUT_REVERSE] = rvv_combine_out_reverse_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_ATOP] = rvv_combine_atop_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_ATOP_REVERSE] = rvv_combine_atop_reverse_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_XOR] = rvv_combine_xor_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_ADD] = rvv_combine_add_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_SATURATE] = rvv_combine_saturate_ca_float;
+
+    /* Disjoint CA */
+    imp->combine_float_ca[PIXMAN_OP_DISJOINT_CLEAR] = rvv_combine_disjoint_clear_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_DISJOINT_SRC] = rvv_combine_disjoint_src_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_DISJOINT_DST] = rvv_combine_disjoint_dst_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_DISJOINT_OVER] = rvv_combine_disjoint_over_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_DISJOINT_OVER_REVERSE] = rvv_combine_disjoint_over_reverse_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_DISJOINT_IN] = rvv_combine_disjoint_in_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_DISJOINT_IN_REVERSE] = rvv_combine_disjoint_in_reverse_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_DISJOINT_OUT] = rvv_combine_disjoint_out_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_DISJOINT_OUT_REVERSE] = rvv_combine_disjoint_out_reverse_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_DISJOINT_ATOP] = rvv_combine_disjoint_atop_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = rvv_combine_disjoint_atop_reverse_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_DISJOINT_XOR] = rvv_combine_disjoint_xor_ca_float;
+
+    /* Conjoint CA */
+    imp->combine_float_ca[PIXMAN_OP_CONJOINT_CLEAR] = rvv_combine_conjoint_clear_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_CONJOINT_SRC] = rvv_combine_conjoint_src_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_CONJOINT_DST] = rvv_combine_conjoint_dst_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_CONJOINT_OVER] = rvv_combine_conjoint_over_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_CONJOINT_OVER_REVERSE] = rvv_combine_conjoint_over_reverse_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_CONJOINT_IN] = rvv_combine_conjoint_in_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_CONJOINT_IN_REVERSE] = rvv_combine_conjoint_in_reverse_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_CONJOINT_OUT] = rvv_combine_conjoint_out_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_CONJOINT_OUT_REVERSE] = rvv_combine_conjoint_out_reverse_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_CONJOINT_ATOP] = rvv_combine_conjoint_atop_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = rvv_combine_conjoint_atop_reverse_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_CONJOINT_XOR] = rvv_combine_conjoint_xor_ca_float;
+
+    /* PDF operators CA */
+    imp->combine_float_ca[PIXMAN_OP_MULTIPLY] = rvv_combine_multiply_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_SCREEN] = rvv_combine_screen_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_OVERLAY] = rvv_combine_overlay_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_DARKEN] = rvv_combine_darken_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_LIGHTEN] = rvv_combine_lighten_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_COLOR_DODGE] = rvv_combine_color_dodge_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_COLOR_BURN] = rvv_combine_color_burn_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_HARD_LIGHT] = rvv_combine_hard_light_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_SOFT_LIGHT] = rvv_combine_soft_light_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_DIFFERENCE] = rvv_combine_difference_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_EXCLUSION] = rvv_combine_exclusion_ca_float;
+
+    /* It is not clear that these make sense, so make them noops for now */
+    imp->combine_float_ca[PIXMAN_OP_HSL_HUE] = rvv_combine_dst_u_float;
+    imp->combine_float_ca[PIXMAN_OP_HSL_SATURATION] = rvv_combine_dst_u_float;
+    imp->combine_float_ca[PIXMAN_OP_HSL_COLOR] = rvv_combine_dst_u_float;
+    imp->combine_float_ca[PIXMAN_OP_HSL_LUMINOSITY] = rvv_combine_dst_u_float;
+
+    /* Set up function pointers */
+    imp->combine_32[PIXMAN_OP_CLEAR] = rvv_combine_clear;
+    imp->combine_32[PIXMAN_OP_SRC] = rvv_combine_src_u;
+    imp->combine_32[PIXMAN_OP_OVER] = rvv_combine_over_u;
+    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = rvv_combine_over_reverse_u;
+    imp->combine_32[PIXMAN_OP_IN] = rvv_combine_in_u;
+    imp->combine_32[PIXMAN_OP_IN_REVERSE] = rvv_combine_in_reverse_u;
+    imp->combine_32[PIXMAN_OP_OUT] = rvv_combine_out_u;
+    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = rvv_combine_out_reverse_u;
+    imp->combine_32[PIXMAN_OP_ATOP] = rvv_combine_atop_u;
+    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = rvv_combine_atop_reverse_u;
+    imp->combine_32[PIXMAN_OP_XOR] = rvv_combine_xor_u;
+    imp->combine_32[PIXMAN_OP_ADD] = rvv_combine_add_u;
+
+    imp->combine_32[PIXMAN_OP_MULTIPLY] = rvv_combine_multiply_u;
+    imp->combine_32[PIXMAN_OP_SCREEN] = rvv_combine_screen_u;
+    imp->combine_32[PIXMAN_OP_OVERLAY] = rvv_combine_overlay_u;
+    imp->combine_32[PIXMAN_OP_DARKEN] = rvv_combine_darken_u;
+    imp->combine_32[PIXMAN_OP_LIGHTEN] = rvv_combine_lighten_u;
+    imp->combine_32[PIXMAN_OP_HARD_LIGHT] = rvv_combine_hard_light_u;
+    imp->combine_32[PIXMAN_OP_DIFFERENCE] = rvv_combine_difference_u;
+    imp->combine_32[PIXMAN_OP_EXCLUSION] = rvv_combine_exclusion_u;
+
+    imp->combine_32_ca[PIXMAN_OP_CLEAR] = rvv_combine_clear;
+    imp->combine_32_ca[PIXMAN_OP_SRC] = rvv_combine_src_ca;
+    imp->combine_32_ca[PIXMAN_OP_OVER] = rvv_combine_over_ca;
+    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = rvv_combine_over_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_IN] = rvv_combine_in_ca;
+    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = rvv_combine_in_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_OUT] = rvv_combine_out_ca;
+    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = rvv_combine_out_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_ATOP] = rvv_combine_atop_ca;
+    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = rvv_combine_atop_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_XOR] = rvv_combine_xor_ca;
+    imp->combine_32_ca[PIXMAN_OP_ADD] = rvv_combine_add_ca;
+
+    imp->combine_32_ca[PIXMAN_OP_MULTIPLY] = rvv_combine_multiply_ca;
+    imp->combine_32_ca[PIXMAN_OP_SCREEN] = rvv_combine_screen_ca;
+    imp->combine_32_ca[PIXMAN_OP_OVERLAY] = rvv_combine_overlay_ca;
+    imp->combine_32_ca[PIXMAN_OP_DARKEN] = rvv_combine_darken_ca;
+    imp->combine_32_ca[PIXMAN_OP_LIGHTEN] = rvv_combine_lighten_ca;
+    imp->combine_32_ca[PIXMAN_OP_HARD_LIGHT] = rvv_combine_hard_light_ca;
+    imp->combine_32_ca[PIXMAN_OP_DIFFERENCE] = rvv_combine_difference_ca;
+    imp->combine_32_ca[PIXMAN_OP_EXCLUSION] = rvv_combine_exclusion_ca;
+
+    imp->fill = rvv_fill;
+    imp->blt = rvv_blt;
 
     return imp;
 }
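
For reference, the integer fast paths above are built on pixman's standard 8-bit
fixed-point arithmetic. The scalar sketch below illustrates that math; the helper
names are hypothetical (not pixman API), and the semantics are inferred from the
UN8x4_*/UN8_* naming conventions of pixman-combine32.h — the real macros
additionally saturate the final addition:

/* Scalar sketch of the per-pixel math mirrored by the m4-wide helpers
 * (rvv_in_m4, rvv_over_m4, rvv_UN8x4_MUL_UN8x4_vv_m4, ...).
 * Illustration only; the functions below are not pixman API.
 */
#include <stdint.h>

/* x * a / 255 with pixman's usual rounding:
 * ((x*a + 0x80) + ((x*a + 0x80) >> 8)) >> 8 */
static inline uint8_t
mul_un8 (uint8_t x, uint8_t a)
{
    uint16_t t = (uint16_t)x * a + 0x80;
    return (uint8_t)((t + (t >> 8)) >> 8);
}

/* IN with an a8 mask: scale every channel of the a8r8g8b8 pixel s by m. */
static inline uint32_t
in_un8x4 (uint32_t s, uint8_t m)
{
    uint32_t r = 0;
    for (int shift = 0; shift < 32; shift += 8)
        r |= (uint32_t)mul_un8 ((s >> shift) & 0xff, m) << shift;
    return r;
}

/* OVER (premultiplied alpha): d' = s + d * (255 - alpha(s)), per channel.
 * Valid premultiplied input cannot overflow a channel, so the saturating
 * add of the real macros is elided here. */
static inline uint32_t
over_un8x4 (uint32_t s, uint32_t d)
{
    uint8_t ia = 0xff - (uint8_t)(s >> 24);
    return s + in_un8x4 (d, ia);
}

The *_0565 paths expand r5g6b5 pixels to 8-bit channels first (replicating the top
bits into the low bits, as the scalar convert_0565_to_0888 does), run the same
8-bit math, and pack the result back down.
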
commit 59d9bf27c7011a39b0f46f3087bcd9dc7b0df206
Author: Marek Pikuła <m.pikula at partner.samsung.com>
Date:   Wed Apr 9 21:33:48 2025 +0200

    rvv: Add float suffix to float implementation
    
    To prevent name clashes with the upcoming integer implementation, the
    float-specific functions are renamed to include a `_float` suffix.
    
    Signed-off-by: Marek Pikuła <m.pikula at partner.samsung.com>

diff --git a/pixman/pixman-rvv.c b/pixman/pixman-rvv.c
index 2a9d344..478fa62 100644
--- a/pixman/pixman-rvv.c
+++ b/pixman/pixman-rvv.c
@@ -49,11 +49,11 @@
  */
 
 static force_inline vfloat32m1_t
-rvv_blend_screen (const vfloat32m1_t sa,
-		  const vfloat32m1_t s,
-		  const vfloat32m1_t da,
-		  const vfloat32m1_t d,
-		  size_t             vl)
+rvv_blend_screen_float (const vfloat32m1_t sa,
+			const vfloat32m1_t s,
+			const vfloat32m1_t da,
+			const vfloat32m1_t d,
+			size_t             vl)
 {
     vfloat32m1_t t0, t1, t2;
     t0 = __riscv_vfmul_vv_f32m1 (s, da, vl);
@@ -72,11 +72,11 @@ rvv_blend_screen (const vfloat32m1_t sa,
  */
 
 static force_inline vfloat32m1_t
-rvv_blend_multiply (const vfloat32m1_t sa,
-		    const vfloat32m1_t s,
-		    const vfloat32m1_t da,
-		    const vfloat32m1_t d,
-		    size_t             vl)
+rvv_blend_multiply_float (const vfloat32m1_t sa,
+			  const vfloat32m1_t s,
+			  const vfloat32m1_t da,
+			  const vfloat32m1_t d,
+			  size_t             vl)
 {
     return __riscv_vfmul_vv_f32m1 (s, d, vl);
 }
@@ -105,11 +105,11 @@ rvv_blend_multiply (const vfloat32m1_t sa,
  */
 
 static force_inline vfloat32m1_t
-rvv_blend_overlay (const vfloat32m1_t sa,
-		   const vfloat32m1_t s,
-		   const vfloat32m1_t da,
-		   const vfloat32m1_t d,
-		   size_t             vl)
+rvv_blend_overlay_float (const vfloat32m1_t sa,
+			 const vfloat32m1_t s,
+			 const vfloat32m1_t da,
+			 const vfloat32m1_t d,
+			 size_t             vl)
 {
     vfloat32m1_t t0, t1, t2, t3, t4, f0, f1, f2;
     vbool32_t    vb;
@@ -134,11 +134,11 @@ rvv_blend_overlay (const vfloat32m1_t sa,
  */
 
 static force_inline vfloat32m1_t
-rvv_blend_darken (const vfloat32m1_t sa,
-		  const vfloat32m1_t s,
-		  const vfloat32m1_t da,
-		  const vfloat32m1_t d,
-		  size_t             vl)
+rvv_blend_darken_float (const vfloat32m1_t sa,
+			const vfloat32m1_t s,
+			const vfloat32m1_t da,
+			const vfloat32m1_t d,
+			size_t             vl)
 {
     vfloat32m1_t ss, dd;
     vbool32_t    vb;
@@ -157,11 +157,11 @@ rvv_blend_darken (const vfloat32m1_t sa,
  */
 
 static force_inline vfloat32m1_t
-rvv_blend_lighten (const vfloat32m1_t sa,
-		   const vfloat32m1_t s,
-		   const vfloat32m1_t da,
-		   const vfloat32m1_t d,
-		   size_t             vl)
+rvv_blend_lighten_float (const vfloat32m1_t sa,
+			 const vfloat32m1_t s,
+			 const vfloat32m1_t da,
+			 const vfloat32m1_t d,
+			 size_t             vl)
 {
     vfloat32m1_t ss, dd;
     vbool32_t    vb;
@@ -191,11 +191,11 @@ rvv_blend_lighten (const vfloat32m1_t sa,
  */
 
 static force_inline vfloat32m1_t
-rvv_blend_color_dodge (const vfloat32m1_t sa,
-		       const vfloat32m1_t s,
-		       const vfloat32m1_t da,
-		       const vfloat32m1_t d,
-		       size_t             vl)
+rvv_blend_color_dodge_float (const vfloat32m1_t sa,
+			     const vfloat32m1_t s,
+			     const vfloat32m1_t da,
+			     const vfloat32m1_t d,
+			     size_t             vl)
 {
     vfloat32m1_t t0, t1, t2, t3, t4;
     vbool32_t    is_d_zero, vb, is_t0_non_zero;
@@ -241,11 +241,11 @@ rvv_blend_color_dodge (const vfloat32m1_t sa,
  */
 
 static force_inline vfloat32m1_t
-rvv_blend_color_burn (const vfloat32m1_t sa,
-		      const vfloat32m1_t s,
-		      const vfloat32m1_t da,
-		      const vfloat32m1_t d,
-		      size_t             vl)
+rvv_blend_color_burn_float (const vfloat32m1_t sa,
+			    const vfloat32m1_t s,
+			    const vfloat32m1_t da,
+			    const vfloat32m1_t d,
+			    size_t             vl)
 {
     vfloat32m1_t t0, t1, t2, t3, t4, t5, t6, t7;
     vbool32_t    is_d_ge_da, is_s_zero, vb;
@@ -289,11 +289,11 @@ rvv_blend_color_burn (const vfloat32m1_t sa,
  */
 
 static force_inline vfloat32m1_t
-rvv_blend_hard_light (const vfloat32m1_t sa,
-		      const vfloat32m1_t s,
-		      const vfloat32m1_t da,
-		      const vfloat32m1_t d,
-		      size_t             vl)
+rvv_blend_hard_light_float (const vfloat32m1_t sa,
+			    const vfloat32m1_t s,
+			    const vfloat32m1_t da,
+			    const vfloat32m1_t d,
+			    size_t             vl)
 {
     vfloat32m1_t t0, t1, t2, t3, t4;
     vbool32_t    vb;
@@ -328,11 +328,11 @@ rvv_blend_hard_light (const vfloat32m1_t sa,
  */
 
 static force_inline vfloat32m1_t
-rvv_blend_soft_light (const vfloat32m1_t sa,
-		      const vfloat32m1_t s,
-		      const vfloat32m1_t da,
-		      const vfloat32m1_t d,
-		      size_t             vl)
+rvv_blend_soft_light_float (const vfloat32m1_t sa,
+			    const vfloat32m1_t s,
+			    const vfloat32m1_t da,
+			    const vfloat32m1_t d,
+			    size_t             vl)
 {
     vfloat32m1_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13;
     vbool32_t    is_sa_lt_2s, is_da_ls_4d, is_da_non_zero;
@@ -396,11 +396,11 @@ rvv_blend_soft_light (const vfloat32m1_t sa,
  */
 
 static force_inline vfloat32m1_t
-rvv_blend_difference (const vfloat32m1_t sa,
-		      const vfloat32m1_t s,
-		      const vfloat32m1_t da,
-		      const vfloat32m1_t d,
-		      size_t             vl)
+rvv_blend_difference_float (const vfloat32m1_t sa,
+			    const vfloat32m1_t s,
+			    const vfloat32m1_t da,
+			    const vfloat32m1_t d,
+			    size_t             vl)
 {
     vfloat32m1_t dsa, sda;
     vbool32_t    vb;
@@ -421,11 +421,11 @@ rvv_blend_difference (const vfloat32m1_t sa,
  */
 
 static force_inline vfloat32m1_t
-rvv_blend_exclusion (const vfloat32m1_t sa,
-		     const vfloat32m1_t s,
-		     const vfloat32m1_t da,
-		     const vfloat32m1_t d,
-		     size_t             vl)
+rvv_blend_exclusion_float (const vfloat32m1_t sa,
+			   const vfloat32m1_t s,
+			   const vfloat32m1_t da,
+			   const vfloat32m1_t d,
+			   size_t             vl)
 {
     vfloat32m1_t t0, t1;
     t0 = __riscv_vfmul_vv_f32m1 (__riscv_vfadd_vv_f32m1 (d, d, vl), s, vl);
@@ -441,13 +441,13 @@ typedef vfloat32m1_t (*rvv_combine_channel_float_t) (const vfloat32m1_t sa,
 						     size_t             vl);
 
 static force_inline void
-rvv_combine_inner (pixman_bool_t               component,
-		   float                      *dest,
-		   const float                *src,
-		   const float                *mask,
-		   int                         n_pixels,
-		   rvv_combine_channel_float_t combine_a,
-		   rvv_combine_channel_float_t combine_c)
+rvv_combine_inner_float (pixman_bool_t               component,
+			 float                      *dest,
+			 const float                *src,
+			 const float                *mask,
+			 int                         n_pixels,
+			 rvv_combine_channel_float_t combine_a,
+			 rvv_combine_channel_float_t combine_c)
 {
     float *__restrict__ pd       = dest;
     const float *__restrict__ ps = src;
@@ -617,8 +617,8 @@ rvv_combine_inner (pixman_bool_t               component,
 	pixman_implementation_t *imp, pixman_op_t op, float *dest,             \
 	const float *src, const float *mask, int n_pixels)                     \
     {                                                                          \
-	rvv_combine_inner (component, dest, src, mask, n_pixels, combine_a,    \
-			   combine_c);                                         \
+	rvv_combine_inner_float (component, dest, src, mask, n_pixels,         \
+				 combine_a, combine_c);                        \
     }
 
 #define RVV_MAKE_COMBINERS(name, combine_a, combine_c)                         \
@@ -626,10 +626,10 @@ rvv_combine_inner (pixman_bool_t               component,
     RVV_MAKE_COMBINER (name##_u, FALSE, combine_a, combine_c)
 
 static force_inline vfloat32m1_t
-rvv_get_factor (combine_factor_t factor,
-		vfloat32m1_t     sa,
-		vfloat32m1_t     da,
-		size_t           vl)
+rvv_get_factor_float (combine_factor_t factor,
+		      vfloat32m1_t     sa,
+		      vfloat32m1_t     da,
+		      size_t           vl)
 {
     vfloat32m1_t vone  = __riscv_vfmv_v_f_f32m1 (1.0f, vl);
     vfloat32m1_t vzero = __riscv_vfmv_v_f_f32m1 (0.0f, vl);
@@ -751,12 +751,12 @@ rvv_get_factor (combine_factor_t factor,
 }
 
 #define RVV_MAKE_PD_COMBINERS(name, a, b)                                      \
-    static vfloat32m1_t force_inline rvv_pd_combine_##name (                   \
+    static vfloat32m1_t force_inline rvv_pd_combine_##name##_float (           \
 	vfloat32m1_t sa, vfloat32m1_t s, vfloat32m1_t da, vfloat32m1_t d,      \
 	size_t vl)                                                             \
     {                                                                          \
-	const vfloat32m1_t fa = rvv_get_factor (a, sa, da, vl);                \
-	const vfloat32m1_t fb = rvv_get_factor (b, sa, da, vl);                \
+	const vfloat32m1_t fa = rvv_get_factor_float (a, sa, da, vl);          \
+	const vfloat32m1_t fb = rvv_get_factor_float (b, sa, da, vl);          \
 	vfloat32m1_t       t0 = __riscv_vfadd_vv_f32m1 (                       \
             __riscv_vfmul_vv_f32m1 (s, fa, vl),                          \
             __riscv_vfmul_vv_f32m1 (d, fb, vl), vl);                     \
@@ -764,7 +764,8 @@ rvv_get_factor (combine_factor_t factor,
 				       vl);                                    \
     }                                                                          \
                                                                                \
-    RVV_MAKE_COMBINERS (name, rvv_pd_combine_##name, rvv_pd_combine_##name)
+    RVV_MAKE_COMBINERS (name, rvv_pd_combine_##name##_float,                   \
+			rvv_pd_combine_##name##_float)
 
 RVV_MAKE_PD_COMBINERS (clear, ZERO, ZERO)
 RVV_MAKE_PD_COMBINERS (src, ONE, ZERO)
@@ -833,8 +834,8 @@ RVV_MAKE_PD_COMBINERS (conjoint_xor, ONE_MINUS_DA_OVER_SA, ONE_MINUS_SA_OVER_DA)
 		vl),                                                           \
 	    -1.0f, vl);                                                        \
                                                                                \
-	return __riscv_vfadd_vv_f32m1 (f, rvv_blend_##name (sa, s, da, d, vl), \
-				       vl);                                    \
+	return __riscv_vfadd_vv_f32m1 (                                        \
+	    f, rvv_blend_##name##_float (sa, s, da, d, vl), vl);               \
     }                                                                          \
                                                                                \
     RVV_MAKE_COMBINERS (name, rvv_combine_##name##_a, rvv_combine_##name##_c)

