pixman: Branch 'master' - 10 commits

Tue Apr 30 15:30:20 PDT 2013

pixman/pixman-mips-dspr2-asm.S |  723 +++++++++++++++++++++++++++++------------
 pixman/pixman-mips-dspr2-asm.h |   51 ++
 pixman/pixman-mips-dspr2.c     |   31 +
 pixman/pixman-mips-dspr2.h     |   42 ++
 pixman/refactor                |  478 ---------------------------
 test/lowlevel-blt-bench.c      |   13 
 6 files changed, 647 insertions(+), 691 deletions(-)

New commits:
commit 7fc2654a1fdd6d6c41eddaac50b3668433873679
Author: SÃ¸ren Sandmann Pedersen <ssp at redhat.com>
Date:   Sat Apr 27 04:27:39 2013 -0400

    pixman/refactor: Delete this file
    
    Essentially all of it is obsolete by now.

diff --git a/pixman/refactor b/pixman/refactor
deleted file mode 100644
index 65e207a..0000000
--- a/pixman/refactor
+++ /dev/null
@@ -1,478 +0,0 @@
-Roadmap
-
-- Move all the fetchers etc. into pixman-image to make pixman-compose.c
-  less intimidating.
-
-  DONE
-
-- Make combiners for unified alpha take a mask argument. That way
-  we won't need two separate paths for unified vs component in the
-  general compositing code.
-
-  DONE, except that the Altivec code needs to be updated. Luca is
-  looking into that.
-
-- Delete separate 'unified alpha' path
- 
-  DONE
-
-- Split images into their own files
-
-  DONE
-
-- Split the gradient walker code out into its own file
-
-  DONE
-
-- Add scanline getters per image
-
-  DONE
-
-- Generic 64 bit fetcher 
-
-  DONE
-
-- Split fast path tables into their respective architecture dependent
-  files.
-
-See "Render Algorithm" below for rationale
-
-Images will eventually have these virtual functions:
-
-       get_scanline()
-       get_scanline_wide()
-       get_pixel()
-       get_pixel_wide()
-       get_untransformed_pixel()
-       get_untransformed_pixel_wide()
-       get_unfiltered_pixel()
-       get_unfiltered_pixel_wide()
-
-       store_scanline()
-       store_scanline_wide()
-
-1.
-
-Initially we will just have get_scanline() and get_scanline_wide();
-these will be based on the ones in pixman-compose. Hopefully this will
-reduce the complexity in pixman_composite_rect_general().
-
-Note that there is access considerations - the compose function is
-being compiled twice.
-
-
-2.
-
-Split image types into their own source files. Export noop virtual
-reinit() call.  Call this whenever a property of the image changes.
-
-
-3. 
-
-Split the get_scanline() call into smaller functions that are
-initialized by the reinit() call.
-
-The Render Algorithm:
-	(first repeat, then filter, then transform, then clip)
-
-Starting from a destination pixel (x, y), do
-
-	1 x = x - xDst + xSrc
-	  y = y - yDst + ySrc
-
-	2 reject pixel that is outside the clip
-
-	This treats clipping as something that happens after
-	transformation, which I think is correct for client clips. For
-	hierarchy clips it is wrong, but who really cares? Without
-	GraphicsExposes hierarchy clips are basically irrelevant. Yes,
-	you could imagine cases where the pixels of a subwindow of a
-	redirected, transformed window should be treated as
-	transparent. I don't really care
-
-	Basically, I think the render spec should say that pixels that
-	are unavailable due to the hierarchy have undefined content,
-	and that GraphicsExposes are not generated. Ie., basically
-	that using non-redirected windows as sources is fail. This is
-	at least consistent with the current implementation and we can
-	update the spec later if someone makes it work.
-
-	The implication for render is that it should stop passing the
-	hierarchy clip to pixman. In pixman, if a souce image has a
-	clip it should be used in computing the composite region and
-	nowhere else, regardless of what "has_client_clip" says. The
-	default should be for there to not be any clip.
-
-	I would really like to get rid of the client clip as well for
-	source images, but unfortunately there is at least one
-	application in the wild that uses them.
-
-	3 Transform pixel: (x, y) = T(x, y)
-
-	4 Call p = GetUntransformedPixel (x, y)
-
-	5 If the image has an alpha map, then
-
-		Call GetUntransformedPixel (x, y) on the alpha map
-		
-		add resulting alpha channel to p
-
-	   return p
-
-	Where GetUnTransformedPixel is:
-
-	6 switch (filter)
-	  {
-	  case NEAREST:
-		return GetUnfilteredPixel (x, y);
-		break;
-
-	  case BILINEAR:
-		return GetUnfilteredPixel (...) // 4 times 
-		break;
-
-	  case CONVOLUTION:
-		return GetUnfilteredPixel (...) // as many times as necessary.
-		break;
-	  }
-
-	Where GetUnfilteredPixel (x, y) is
-
-	7 switch (repeat)
-	   {
-	   case REPEAT_NORMAL:
-	   case REPEAT_PAD:
-	   case REPEAT_REFLECT:
-		// adjust x, y as appropriate
-		break;
-
-	   case REPEAT_NONE:
-	        if (x, y) is outside image bounds
-		     return 0;
-		break;
-	   }
-
-	   return GetRawPixel(x, y)
-
-	Where GetRawPixel (x, y) is
-
-	8 Compute the pixel in question, depending on image type.
-
-For gradients, repeat has a totally different meaning, so
-UnfilteredPixel() and RawPixel() must be the same function so that
-gradients can do their own repeat algorithm.
-
-So, the GetRawPixel
-
-	for bits must deal with repeats
-	for gradients must deal with repeats (differently)
-	for solids, should ignore repeats.
-
-	for polygons, when we add them, either ignore repeats or do
-	something similar to bits (in which case, we may want an extra
-	layer of indirection to modify the coordinates).
-
-It is then possible to build things like "get scanline" or "get tile" on
-top of this. In the simplest case, just repeatedly calling GetPixel()
-would work, but specialized get_scanline()s or get_tile()s could be
-plugged in for common cases. 
-
-By not plugging anything in for images with access functions, we only
-have to compile the pixel functions twice, not the scanline functions.
-
-And we can get rid of fetchers for the bizarre formats that no one
-uses. Such as b2g3r3 etc. r1g2b1? Seriously? It is also worth
-considering a generic format based pixel fetcher for these edge cases.
-
-Since the actual routines depend on the image attributes, the images
-must be notified when those change and update their function pointers
-appropriately. So there should probably be a virtual function called
-(* reinit) or something like that.
-
-There will also be wide fetchers for both pixels and lines. The line
-fetcher will just call the wide pixel fetcher. The wide pixel fetcher
-will just call expand, except for 10 bit formats.
-
-Rendering pipeline:
-
-Drawable:
-	0. if (picture has alpha map)
-		0.1. Position alpha map according to the alpha_x/alpha_y
-	        0.2. Where the two drawables intersect, the alpha channel
-		     Replace the alpha channel of source with the one
-		     from the alpha map. Replacement only takes place
-		     in the intersection of the two drawables' geometries.
-	1. Repeat the drawable according to the repeat attribute
-	2. Reconstruct a continuous image according to the filter
-	3. Transform according to the transform attribute
-	4. Position image such that src_x, src_y is over dst_x, dst_y
-	5. Sample once per destination pixel 
-	6. Clip. If a pixel is not within the source clip, then no
-	   compositing takes place at that pixel. (Ie., it's *not*
-	   treated as 0).
-
-	Sampling a drawable: 
-
-	- If the channel does not have an alpha channel, the pixels in it
-	  are treated as opaque.
-
-	Note on reconstruction:
-
-	- The top left pixel has coordinates (0.5, 0.5) and pixels are
-	  spaced 1 apart.
-
-Gradient:
-	1. Unless gradient type is conical, repeat the underlying (0, 1)
-		gradient according to the repeat attribute
-	2. Integrate the gradient across the plane according to type.
-	3. Transform according to transform attribute
-	4. Position gradient 
-	5. Sample once per destination pixel.
- 	6. Clip
-
-Solid Fill:
-	1. Repeat has no effect
-	2. Image is already continuous and defined for the entire plane
-	3. Transform has no effect
-	4. Positioning has no effect
-	5. Sample once per destination pixel.
-	6. Clip
-
-Polygon:
-	1. Repeat has no effect
-	2. Image is already continuous and defined on the whole plane
-	3. Transform according to transform attribute
-	4. Position image
-	5. Supersample 15x17 per destination pixel.
-	6. Clip
-
-Possibly interesting additions:
-	- More general transformations, such as warping, or general
-	  shading.
-
-	- Shader image where a function is called to generate the
-          pixel (ie., uploading assembly code).
-
-	- Resampling kernels
-
-	  In principle the polygon image uses a 15x17 box filter for
-	  resampling. If we allow general resampling filters, then we
-	  get all the various antialiasing types for free. 
-
-	  Bilinear downsampling looks terrible and could be much 
-	  improved by a resampling filter. NEAREST reconstruction
-	  combined with a box resampling filter is what GdkPixbuf
-	  does, I believe.
-
-	  Useful for high frequency gradients as well.
-
-	  (Note that the difference between a reconstruction and a
-	  resampling filter is mainly where in the pipeline they
-	  occur. High quality resampling should use a correctly
-	  oriented kernel so it should happen after transformation.
-
-	  An implementation can transform the resampling kernel and
-	  convolve it with the reconstruction if it so desires, but it
-	  will need to deal with the fact that the resampling kernel
-	  will not necessarily be pixel aligned.
-
-	  "Output kernels"
-
-	  One could imagine doing the resampling after compositing,
-	  ie., for each destination pixel sample each source image 16
-	  times, then composite those subpixels individually, then
-	  finally apply a kernel.
-
-	  However, this is effectively the same as full screen
-	  antialiasing, which is a simpler way to think about it. So
-	  resampling kernels may make sense for individual images, but
-	  not as a post-compositing step.
-	  
-	  Fullscreen AA is inefficient without chained compositing
-	  though. Consider an (image scaled up to oversample size IN
-	  some polygon) scaled down to screen size. With the current
-	  implementation, there will be a huge temporary. With chained
-	  compositing, the whole thing ends up being equivalent to the
-	  output kernel from above.
-
-	- Color space conversion
-
-	  The complete model here is that each surface has a color
-	  space associated with it and that the compositing operation
-	  also has one associated with it. Note also that gradients
-	  should have associcated colorspaces.
-
-	- Dithering
-
-	  If people dither something that is already dithered, it will
-	  look terrible, but don't do that, then. (Dithering happens
-	  after resampling if at all - what is the relationship
-	  with color spaces? Presumably dithering should happen in linear
-	  intensity space).
-
-	- Floating point surfaces, 16, 32 and possibly 64 bit per
-	  channel.
-
-	Maybe crack:
-
-	- Glyph polygons
-
-	  If glyphs could be given as polygons, they could be
-	  positioned and rasterized more accurately. The glyph
-	  structure would need subpixel positioning though.
-
-	- Luminance vs. coverage for the alpha channel
-
-	  Whether the alpha channel should be interpreted as luminance
-          modulation or as coverage (intensity modulation). This is a
-          bit of a departure from the rendering model though. It could
-	  also be considered whether it should be possible to have 
-	  both channels in the same drawable.
-
-	- Alternative for component alpha
-
-	  - Set component-alpha on the output image.
-
-	    - This means each of the components are sampled
-	      independently and composited in the corresponding
-	      channel only.
-
-	  - Have 3 x oversampled mask
-
-	  - Scale it down by 3 horizontally, with [ 1/3, 1/3, 1/3 ]
-            resampling filter. 
-
-	    Is this equivalent to just using a component alpha mask?
-
-	Incompatible changes:
-
-	- Gradients could be specified with premultiplied colors. (You
-	  can use a mask to get things like gradients from solid red to
-	  transparent red.
-
-Refactoring pixman
-
-The pixman code is not particularly nice to put it mildly. Among the
-issues are
-
-- inconsistent naming style (fb vs Fb, camelCase vs
-  underscore_naming). Sometimes there is even inconsistency *within*
-  one name.
-
-      fetchProc32 ACCESS(pixman_fetchProcForPicture32)
-
-  may be one of the uglies names ever created.
-
-  coding style: 
-  	 use the one from cairo except that pixman uses this brace style:
-	 
-		while (blah)
-		{
-		}
-
-	Format do while like this:
-
-	       do 
-	       {
-
-	       } 
-	       while (...);
-
-- PIXMAN_COMPOSITE_RECT_GENERAL() is horribly complex
-
-- switch case logic in pixman-access.c
-
-  Instead it would be better to just store function pointers in the
-  image objects themselves,
-
-  	get_pixel()
-	get_scanline()
-
-- Much of the scanline fetching code is for formats that no one 
-  ever uses. a2r2g2b2 anyone?
-
-  It would probably be worthwhile having a generic fetcher for any
-  pixman format whatsoever.
-
-- Code related to particular image types should be split into individual
-  files.
-
-	pixman-bits-image.c
-	pixman-linear-gradient-image.c
-	pixman-radial-gradient-image.c
-	pixman-solid-image.c
-
-- Fast path code should be split into files based on architecture:
-
-       pixman-mmx-fastpath.c
-       pixman-sse2-fastpath.c
-       pixman-c-fastpath.c
-
-       etc.
-
-  Each of these files should then export a fastpath table, which would
-  be declared in pixman-private.h. This should allow us to get rid
-  of the pixman-mmx.h files.
-
-  The fast path table should describe each fast path. Ie there should
-  be bitfields indicating what things the fast path can handle, rather than
-  like now where it is only allowed to take one format per src/mask/dest. Ie., 
-
-  { 
-    FAST_a8r8g8b8 | FAST_x8r8g8b8,
-    FAST_null,
-    FAST_x8r8g8b8,
-    FAST_repeat_normal | FAST_repeat_none,
-    the_fast_path
-  }
-
-There should then be *one* file that implements pixman_image_composite(). 
-This should do this:
-
-     optimize_operator();
-
-     convert 1x1 repeat to solid (actually this should be done at
-     image creation time).
-     
-     is there a useful fastpath?
-
-There should be a file called pixman-cpu.c that contains all the
-architecture specific stuff to detect what CPU features we have.
-
-Issues that must be kept in mind:
-
-       - we need accessor code to be preserved
-
-       - maybe there should be a "store_scanline" too?
-
-         Is this sufficient?
-
-	 We should preserve the optimization where the
-	 compositing happens directly in the destination
-	 whenever possible.
-
-	- It should be possible to create GPU samplers from the
-	  images.
-
-The "horizontal" classification should be a bit in the image, the
-"vertical" classification should just happen inside the gradient
-file. Note though that
-
-      (a) these will change if the tranformation/repeat changes.
-
-      (b) at the moment the optimization for linear gradients
-          takes the source rectangle into account. Presumably
-	  this is to also optimize the case where the gradient
-	  is close enough to horizontal?
-
-Who is responsible for repeats? In principle it should be the scanline
-fetch. Right now NORMAL repeats are handled by walk_composite_region()
-while other repeats are handled by the scanline code.
-
-
-(Random note on filtering: do you filter before or after
-transformation?  Hardware is going to filter after transformation;
-this is also what pixman does currently). It's not completely clear
-what filtering *after* transformation means. One thing that might look
-good would be to do *supersampling*, ie., compute multiple subpixels
-per destination pixel, then average them together.
commit cb928a77c05a9c581e596b8eb24962d47fc39e9f
Author: Nemanja Lukic <nemanja.lukic at rt-rk.com>
Date:   Mon Apr 15 19:33:02 2013 +0200

    MIPS: DSPr2: Added rpixbuf fast path.
    
    Performance numbers before/after on MIPS-74kc @ 1GHz:
    
    lowlevel-blt-bench results
    
    Referent (before):
           rpixbuf =  L1:  14.63  L2:  13.55  M:  9.91 ( 79.53%)  HT:  8.47  VT:  8.32  R:  8.17  RT:  4.90 (  33Kops/s)
    
    Optimized:
           rpixbuf =  L1:  45.69  L2:  37.30  M: 17.24 (138.31%)  HT: 15.66  VT: 14.88  R: 13.97  RT:  8.38 (  44Kops/s)

diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index 973b0e4..866e93e 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -761,6 +761,65 @@ LEAF_MIPS_DSPR2(pixman_composite_src_pixbuf_8888_asm_mips)
 
 END(pixman_composite_src_pixbuf_8888_asm_mips)
 
+LEAF_MIPS_DSPR2(pixman_composite_src_rpixbuf_8888_asm_mips)
+/*
+ * a0 - dst  (a8r8g8b8)
+ * a1 - src  (a8r8g8b8)
+ * a2 - w
+ */
+
+    SAVE_REGS_ON_STACK 0, v0
+    li       v0, 0x00ff00ff
+
+    beqz     a2, 3f
+     nop
+    addiu    t1, a2, -1
+    beqz     t1, 2f
+     nop
+1:
+    lw       t0, 0(a1)
+    lw       t1, 4(a1)
+    addiu    a1, a1, 8
+    addiu    a2, a2, -2
+    srl      t2, t0, 24
+    srl      t3, t1, 24
+
+    MIPS_2xUN8x4_MUL_2xUN8 t0, t1, t2, t3, t0, t1, v0, t4, t5, t6, t7, t8, t9
+
+    sll      t0, t0, 8
+    sll      t1, t1, 8
+    andi     t2, t2, 0xff
+    andi     t3, t3, 0xff
+    or       t0, t0, t2
+    or       t1, t1, t3
+    rotr     t0, t0, 8
+    rotr     t1, t1, 8
+    sw       t0, 0(a0)
+    sw       t1, 4(a0)
+
+    addiu    t2, a2, -1
+    bgtz     t2, 1b
+     addiu   a0, a0, 8
+2:
+    beqz     a2, 3f
+     nop
+    lw       t0, 0(a1)
+    srl      t1, t0, 24
+
+    MIPS_UN8x4_MUL_UN8 t0, t1, t0, v0, t3, t4, t5
+
+    sll      t0, t0, 8
+    andi     t1, t1, 0xff
+    or       t0, t0, t1
+    rotr     t0, t0, 8
+    sw       t0, 0(a0)
+3:
+    RESTORE_REGS_FROM_STACK 0, v0
+    j        ra
+     nop
+
+END(pixman_composite_src_rpixbuf_8888_asm_mips)
+
 LEAF_MIPS_DSPR2(pixman_composite_src_n_8_8888_asm_mips)
 /*
  * a0 - dst  (a8r8g8b8)
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index 2c2f515..e10c9df 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -56,6 +56,8 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_0888_0565_rev,
 #endif
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_pixbuf_8888,
                                     uint32_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_rpixbuf_8888,
+                                    uint32_t, 1, uint32_t, 1)
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, over_8888_8888,
                                     uint32_t, 1, uint32_t, 1)
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, over_8888_0565,
@@ -302,6 +304,8 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
     PIXMAN_STD_FAST_PATH (SRC, b8g8r8,   null, r5g6b5,   mips_composite_src_0888_0565_rev),
 #endif
     PIXMAN_STD_FAST_PATH (SRC, pixbuf,   pixbuf,  a8r8g8b8, mips_composite_src_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (SRC, pixbuf,   pixbuf,  a8b8g8r8, mips_composite_src_rpixbuf_8888),
+    PIXMAN_STD_FAST_PATH (SRC, rpixbuf,  rpixbuf, a8r8g8b8, mips_composite_src_rpixbuf_8888),
     PIXMAN_STD_FAST_PATH (SRC, rpixbuf,  rpixbuf, a8b8g8r8, mips_composite_src_pixbuf_8888),
     PIXMAN_STD_FAST_PATH (SRC, solid,    a8,   a8r8g8b8, mips_composite_src_n_8_8888),
     PIXMAN_STD_FAST_PATH (SRC, solid,    a8,   x8r8g8b8, mips_composite_src_n_8_8888),
commit c6a6fbdcd3ef18f733ff7ad11d5fafac384744cd
Author: Nemanja Lukic <nemanja.lukic at rt-rk.com>
Date:   Mon Apr 15 19:33:01 2013 +0200

    MIPS: DSPr2: Added pixbuf fast path.
    
    Performance numbers before/after on MIPS-74kc @ 1GHz:
    
    lowlevel-blt-bench results
    
    Referent (before):
            pixbuf =  L1:  18.18  L2:  16.47  M: 13.36 (107.27%)  HT: 10.16  VT: 10.07  R:  9.84  RT:  5.54 (  35Kops/s)
    
    Optimized:
            pixbuf =  L1:  43.54  L2:  36.02  M: 17.08 (137.09%)  HT: 15.58  VT: 14.85  R: 13.87  RT:  8.38 (  44Kops/s)

diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index 3a4d914..973b0e4 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -699,6 +699,68 @@ LEAF_MIPS_DSPR2(pixman_composite_src_0888_0565_rev_asm_mips)
 END(pixman_composite_src_0888_0565_rev_asm_mips)
 #endif
 
+LEAF_MIPS_DSPR2(pixman_composite_src_pixbuf_8888_asm_mips)
+/*
+ * a0 - dst  (a8b8g8r8)
+ * a1 - src  (a8r8g8b8)
+ * a2 - w
+ */
+
+    SAVE_REGS_ON_STACK 0, v0
+    li       v0, 0x00ff00ff
+
+    beqz     a2, 3f
+     nop
+    addiu    t1, a2, -1
+    beqz     t1, 2f
+     nop
+1:
+    lw       t0, 0(a1)
+    lw       t1, 4(a1)
+    addiu    a1, a1, 8
+    addiu    a2, a2, -2
+    srl      t2, t0, 24
+    srl      t3, t1, 24
+
+    MIPS_2xUN8x4_MUL_2xUN8 t0, t1, t2, t3, t0, t1, v0, t4, t5, t6, t7, t8, t9
+
+    sll      t0, t0, 8
+    sll      t1, t1, 8
+    andi     t2, t2, 0xff
+    andi     t3, t3, 0xff
+    or       t0, t0, t2
+    or       t1, t1, t3
+    wsbh     t0, t0
+    wsbh     t1, t1
+    rotr     t0, t0, 16
+    rotr     t1, t1, 16
+    sw       t0, 0(a0)
+    sw       t1, 4(a0)
+
+    addiu    t2, a2, -1
+    bgtz     t2, 1b
+     addiu   a0, a0, 8
+2:
+    beqz     a2, 3f
+     nop
+    lw       t0, 0(a1)
+    srl      t1, t0, 24
+
+    MIPS_UN8x4_MUL_UN8 t0, t1, t0, v0, t3, t4, t5
+
+    sll      t0, t0, 8
+    andi     t1, t1, 0xff
+    or       t0, t0, t1
+    wsbh     t0, t0
+    rotr     t0, t0, 16
+    sw       t0, 0(a0)
+3:
+    RESTORE_REGS_FROM_STACK 0, v0
+    j        ra
+     nop
+
+END(pixman_composite_src_pixbuf_8888_asm_mips)
+
 LEAF_MIPS_DSPR2(pixman_composite_src_n_8_8888_asm_mips)
 /*
  * a0 - dst  (a8r8g8b8)
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index 1949921..2c2f515 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -54,6 +54,8 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_0888_8888_rev,
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_0888_0565_rev,
                                     uint8_t, 3, uint16_t, 1)
 #endif
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_pixbuf_8888,
+                                    uint32_t, 1, uint32_t, 1)
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, over_8888_8888,
                                     uint32_t, 1, uint32_t, 1)
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, over_8888_0565,
@@ -299,6 +301,8 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
     PIXMAN_STD_FAST_PATH (SRC, b8g8r8,   null, x8r8g8b8, mips_composite_src_0888_8888_rev),
     PIXMAN_STD_FAST_PATH (SRC, b8g8r8,   null, r5g6b5,   mips_composite_src_0888_0565_rev),
 #endif
+    PIXMAN_STD_FAST_PATH (SRC, pixbuf,   pixbuf,  a8r8g8b8, mips_composite_src_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (SRC, rpixbuf,  rpixbuf, a8b8g8r8, mips_composite_src_pixbuf_8888),
     PIXMAN_STD_FAST_PATH (SRC, solid,    a8,   a8r8g8b8, mips_composite_src_n_8_8888),
     PIXMAN_STD_FAST_PATH (SRC, solid,    a8,   x8r8g8b8, mips_composite_src_n_8_8888),
     PIXMAN_STD_FAST_PATH (SRC, solid,    a8,   a8b8g8r8, mips_composite_src_n_8_8888),
commit f69335d5292310dc18f2f84d462430137a771976
Author: Nemanja Lukic <nemanja.lukic at rt-rk.com>
Date:   Mon Apr 15 19:33:00 2013 +0200

    test: add "pixbuf" and "rpixbuf" to lowlevel-blt-bench
    
    Add necessary support to lowlevel-blt benchmark for benchmarking pixbuf and
    rpixbuf fast paths. bench_composite function now checks for pixbuf string in
    testname, and if that is detected, use same bits for src and mask images.

diff --git a/test/lowlevel-blt-bench.c b/test/lowlevel-blt-bench.c
index a1657ea..1049e21 100644
--- a/test/lowlevel-blt-bench.c
+++ b/test/lowlevel-blt-bench.c
@@ -385,6 +385,7 @@ bench_composite (char * testname,
     double                          t1, t2, t3, pix_cnt;
     int64_t                         n, l1test_width, nlines;
     double                             bytes_per_pix = 0;
+    pixman_bool_t                   bench_pixbuf = FALSE;
 
     pixman_composite_func_t func = pixman_image_composite_wrapper;
 
@@ -422,16 +423,20 @@ bench_composite (char * testname,
 
     mask_img = NULL;
     xmask_img = NULL;
+    if (strcmp (testname, "pixbuf") == 0 || strcmp (testname, "rpixbuf") == 0)
+    {
+        bench_pixbuf = TRUE;
+    }
     if (!(mask_flags & SOLID_FLAG) && mask_fmt != PIXMAN_null)
     {
         bytes_per_pix += (mask_fmt >> 24) / ((op == PIXMAN_OP_SRC) ? 8.0 : 4.0);
         mask_img = pixman_image_create_bits (mask_fmt,
                                              WIDTH, HEIGHT,
-                                             mask,
+                                             bench_pixbuf ? src : mask,
                                              WIDTH * 4);
         xmask_img = pixman_image_create_bits (mask_fmt,
                                              XWIDTH, XHEIGHT,
-                                             mask,
+                                             bench_pixbuf ? src : mask,
                                              XWIDTH * 4);
     }
     else if (mask_fmt != PIXMAN_null)
@@ -709,6 +714,8 @@ tests_tbl[] =
     { "outrev_n_8888_x888_ca", PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OUT_REV, PIXMAN_a8r8g8b8, 2, PIXMAN_x8r8g8b8 },
     { "outrev_n_8888_8888_ca", PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OUT_REV, PIXMAN_a8r8g8b8, 2, PIXMAN_a8r8g8b8 },
     { "over_reverse_n_8888",   PIXMAN_a8r8g8b8,    0, PIXMAN_OP_OVER_REVERSE, PIXMAN_null, 0, PIXMAN_a8r8g8b8 },
+    { "pixbuf",                PIXMAN_x8b8g8r8,    0, PIXMAN_OP_SRC,     PIXMAN_a8b8g8r8, 0, PIXMAN_a8r8g8b8 },
+    { "rpixbuf",               PIXMAN_x8b8g8r8,    0, PIXMAN_OP_SRC,     PIXMAN_a8b8g8r8, 0, PIXMAN_a8b8g8r8 },
 };
 
 int
commit 3dc9e3827e342b415c519da1039b9a2e4fb293ec
Author: Nemanja Lukic <nemanja.lukic at rt-rk.com>
Date:   Mon Apr 15 19:32:59 2013 +0200

    test: add "src_0888_8888_rev" and "src_0888_0565_rev" to lowlevel-blt-bench

diff --git a/test/lowlevel-blt-bench.c b/test/lowlevel-blt-bench.c
index 4e16f7b..a1657ea 100644
--- a/test/lowlevel-blt-bench.c
+++ b/test/lowlevel-blt-bench.c
@@ -643,6 +643,8 @@ tests_tbl[] =
     { "src_0888_0565",         PIXMAN_r8g8b8,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
     { "src_0888_8888",         PIXMAN_r8g8b8,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
     { "src_0888_x888",         PIXMAN_r8g8b8,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
+    { "src_0888_8888_rev",     PIXMAN_b8g8r8,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
+    { "src_0888_0565_rev",     PIXMAN_b8g8r8,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
     { "src_x888_x888",         PIXMAN_x8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
     { "src_x888_8888",         PIXMAN_x8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
     { "src_8888_8888",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
commit 44174ce51d1ed5a1bf988b9dd9218d8cbd379de3
Author: Nemanja Lukic <nemanja.lukic at rt-rk.com>
Date:   Mon Apr 15 19:32:58 2013 +0200

    MIPS: DSPr2: Fix for bug in in_n_8 routine.
    
    Rounding logic was not implemented right.
    Instead of using rounding version of the 8-bit shift, logical shifts were used.
    Also, code used unnecessary multiplications, which could be avoided by packing
    4 destination (a8) pixel into one 32bit register. There were also, unnecessary
    spills on stack. Code is rewritten to address mentioned issues.
    
    The bug was revealed by increasing number of the iterations in blitters-test.
    
    Performance numbers on MIPS-74kc @ 1GHz:
    
    lowlevel-blt-bench results
    
    Referent (before):
                       in_n_8 =  L1:  21.20  L2:  22.86  M: 21.42 ( 14.21%)  HT: 15.97  VT: 15.69  R: 15.47  RT:  8.00 (  48Kops/s)
    Optimized (first implementation, with bug):
                       in_n_8 =  L1:  89.38  L2:  86.07  M: 65.48 ( 43.44%)  HT: 44.64  VT: 41.50  R: 40.77  RT: 16.94 (  66Kops/s)
    Optimized (with bug fix, and code revisited):
                       in_n_8 =  L1: 102.33  L2:  95.65  M: 70.54 ( 46.84%)  HT: 48.35  VT: 45.06  R: 43.20  RT: 17.60 (  66Kops/s)

diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index b94e66f..3a4d914 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -2974,96 +2974,74 @@ END(pixman_composite_over_reverse_n_8888_asm_mips)
 LEAF_MIPS_DSPR2(pixman_composite_in_n_8_asm_mips)
 /*
  * a0 - dst  (a8)
- * a1 - src  (a8r8g8b8)
+ * a1 - src  (32bit constant)
  * a2 - w
  */
 
-    beqz              a2, 5f
+    li                t9, 0x00ff00ff
+    beqz              a2, 3f
      nop
-
-    SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7
-    move              t7, a1
-    srl               t5, t7, 24
-    replv.ph          t5, t5
-    srl               t9, a2, 2   /* t1 = how many multiples of 4 src pixels */
-    beqz              t9, 2f      /* branch if less than 4 src pixels */
+    srl               t7, a2, 2   /* t7 = how many multiples of 4 dst pixels */
+    beqz              t7, 1f      /* branch if less than 4 src pixels */
      nop
 
-1:
-    addiu             t9, t9, -1
-    addiu             a2, a2, -4
+    srl               t8, a1, 24
+    replv.ph          t8, t8
+
+0:
+    beqz              t7, 1f
+     addiu            t7, t7, -1
     lbu               t0, 0(a0)
     lbu               t1, 1(a0)
     lbu               t2, 2(a0)
     lbu               t3, 3(a0)
 
-    muleu_s.ph.qbl    s0, t0, t5
-    muleu_s.ph.qbr    s1, t0, t5
-    muleu_s.ph.qbl    s2, t1, t5
-    muleu_s.ph.qbr    s3, t1, t5
-    muleu_s.ph.qbl    s4, t2, t5
-    muleu_s.ph.qbr    s5, t2, t5
-    muleu_s.ph.qbl    s6, t3, t5
-    muleu_s.ph.qbr    s7, t3, t5
-
-    shrl.ph           t4, s0, 8
-    shrl.ph           t6, s1, 8
-    shrl.ph           t7, s2, 8
-    shrl.ph           t8, s3, 8
-    addq.ph           t0, s0, t4
-    addq.ph           t1, s1, t6
-    addq.ph           t2, s2, t7
-    addq.ph           t3, s3, t8
-    shra_r.ph         t0, t0, 8
-    shra_r.ph         t1, t1, 8
+    precr_sra.ph.w    t1, t0, 0
+    precr_sra.ph.w    t3, t2, 0
+    precr.qb.ph       t0, t3, t1
+
+    muleu_s.ph.qbl    t2, t0, t8
+    muleu_s.ph.qbr    t3, t0, t8
+    shra_r.ph         t4, t2, 8
+    shra_r.ph         t5, t3, 8
+    and               t4, t4, t9
+    and               t5, t5, t9
+    addq.ph           t2, t2, t4
+    addq.ph           t3, t3, t5
     shra_r.ph         t2, t2, 8
     shra_r.ph         t3, t3, 8
-    shrl.ph           t4, s4, 8
-    shrl.ph           t6, s5, 8
-    shrl.ph           t7, s6, 8
-    shrl.ph           t8, s7, 8
-    addq.ph           s0, s4, t4
-    addq.ph           s1, s5, t6
-    addq.ph           s2, s6, t7
-    addq.ph           s3, s7, t8
-    shra_r.ph         t4, s0, 8
-    shra_r.ph         t6, s1, 8
-    shra_r.ph         t7, s2, 8
-    shra_r.ph         t8, s3, 8
-
-    precr.qb.ph       s0, t0, t1
-    precr.qb.ph       s1, t2, t3
-    precr.qb.ph       s2, t4, t6
-    precr.qb.ph       s3, t7, t8
+    precr.qb.ph       t2, t2, t3
 
-    sb                s0, 0(a0)
-    sb                s1, 1(a0)
-    sb                s2, 2(a0)
-    sb                s3, 3(a0)
-    bgtz              t9, 1b
+    sb                t2, 0(a0)
+    srl               t2, t2, 8
+    sb                t2, 1(a0)
+    srl               t2, t2, 8
+    sb                t2, 2(a0)
+    srl               t2, t2, 8
+    sb                t2, 3(a0)
+    addiu             a2, a2, -4
+    b                 0b
      addiu            a0, a0, 4
-2:
-    beqz              a2, 4f
+
+1:
+    beqz              a2, 3f
      nop
-3:
-    lbu               t1, 0(a0)
+    srl               t8, a1, 24
+2:
+    lbu               t0, 0(a0)
+
+    mul               t2, t0, t8
+    shra_r.ph         t3, t2, 8
+    andi              t3, t3, 0x00ff
+    addq.ph           t2, t2, t3
+    shra_r.ph         t2, t2, 8
 
-    muleu_s.ph.qbl    t4, t1, t5
-    muleu_s.ph.qbr    t7, t1, t5
-    shrl.ph           t6, t4, 8
-    shrl.ph           t0, t7, 8
-    addq.ph           t8, t4, t6
-    addq.ph           t9, t7, t0
-    shra_r.ph         t8, t8, 8
-    shra_r.ph         t9, t9, 8
-    precr.qb.ph       t2, t8, t9
     sb                t2, 0(a0)
     addiu             a2, a2, -1
-    bnez              a2, 3b
+    bnez              a2, 2b
      addiu            a0, a0, 1
-4:
-    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7
-5:
+
+3:
     j                 ra
      nop
 
commit 5858f09d264ef762ddcf7ede324bfce9f5991d29
Author: Nemanja Lukic <nemanja.lukic at rt-rk.com>
Date:   Mon Apr 15 19:32:57 2013 +0200

    MIPS: DSPr2: Added src_0565_8888 nearest neighbor fast path.
    
    Performance numbers before/after on MIPS-74kc @ 1GHz:
    
    lowlevel-blt-bench results
    
    Referent (before):
             src_0565_8888 =  L1:  20.70  L2:  19.22  M: 12.50 ( 49.79%)  HT: 10.45  VT: 10.18  R:  9.99  RT:  5.31 (  31Kops/s)
    
    Optimized:
             src_0565_8888 =  L1:  62.98  L2:  53.44  M: 23.07 ( 91.87%)  HT: 19.85  VT: 19.15  R: 17.70  RT:  9.68 (  43Kops/s)

diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index 3996756..b94e66f 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -3196,6 +3196,65 @@ LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_8888_0565_OVER_asm_mips)
 
 END(pixman_scaled_nearest_scanline_8888_0565_OVER_asm_mips)
 
+LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_0565_8888_SRC_asm_mips)
+/*
+ * a0     - dst (a8r8g8b8)
+ * a1     - src (r5g6b5)
+ * a2     - w
+ * a3     - vx
+ * 16(sp) - unit_x
+ */
+
+    SAVE_REGS_ON_STACK 0, v0
+    beqz     a2, 3f
+     nop
+
+    lw       v0, 16(sp) /* v0 = unit_x */
+    addiu    t1, a2, -1
+    beqz     t1, 2f
+     nop
+
+    li       t4, 0x07e007e0
+    li       t5, 0x001F001F
+1:
+    sra      t0, a3, 16 /* t0 = vx >> 16 */
+    sll      t0, t0, 1  /* t0 = t0 * 2 ((r5g6b5)) */
+    addu     t0, a1, t0
+    lhu      t0, 0(t0)  /* t0 = source ((r5g6b5)) */
+    addu     a3, a3, v0 /* a3 = vx + unit_x */
+    sra      t1, a3, 16 /* t1 = vx >> 16 */
+    sll      t1, t1, 1  /* t1 = t1 * 2 ((r5g6b5)) */
+    addu     t1, a1, t1
+    lhu      t1, 0(t1)  /* t1 = source ((r5g6b5)) */
+    addu     a3, a3, v0 /* a3 = vx + unit_x */
+    addiu    a2, a2, -2
+
+    CONVERT_2x0565_TO_2x8888 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9
+
+    sw       t2, 0(a0)
+    sw       t3, 4(a0)
+
+    addiu    t2, a2, -1
+    bgtz     t2, 1b
+     addiu   a0, a0, 8
+2:
+    beqz     a2, 3f
+     nop
+    sra      t0, a3, 16 /* t0 = vx >> 16 */
+    sll      t0, t0, 1  /* t0 = t0 * 2 ((r5g6b5)) */
+    addu     t0, a1, t0
+    lhu      t0, 0(t0)  /* t0 = source ((r5g6b5)) */
+
+    CONVERT_1x0565_TO_1x8888 t0, t1, t2, t3
+
+    sw       t1, 0(a0)
+3:
+    RESTORE_REGS_FROM_STACK 0, v0
+    j        ra
+     nop
+
+END(pixman_scaled_nearest_scanline_0565_8888_SRC_asm_mips)
+
 LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_mips)
 /*
  * a0     - dst  (r5g6b5)
diff --git a/pixman/pixman-mips-dspr2-asm.h b/pixman/pixman-mips-dspr2-asm.h
index bc458b6..cab122d 100644
--- a/pixman/pixman-mips-dspr2-asm.h
+++ b/pixman/pixman-mips-dspr2-asm.h
@@ -354,17 +354,16 @@ LEAF_MIPS32R2(symbol)                                   \
                                 out1_565, out2_565,  \
                                 maskR, maskG, maskB, \
                                 scratch1, scratch2
-    precrq.ph.w       \scratch1, \in2_8888, \in1_8888
-    precr_sra.ph.w    \in2_8888, \in1_8888, 0
-    shll.ph           \scratch1, \scratch1, 8
-    srl               \in2_8888, \in2_8888, 3
-    and               \scratch2, \in2_8888, \maskB
-    and               \scratch1, \scratch1, \maskR
-    srl               \in2_8888, \in2_8888, 2
-    and               \out2_565, \in2_8888, \maskG
-    or                \out2_565, \out2_565, \scratch2
-    or                \out1_565, \out2_565, \scratch1
-    srl               \out2_565, \out1_565, 16
+    precr.qb.ph    \scratch1, \in2_8888, \in1_8888
+    precrq.qb.ph   \in2_8888, \in2_8888, \in1_8888
+    and            \out1_565, \scratch1, \maskR
+    shrl.ph        \scratch1, \scratch1, 3
+    shll.ph        \in2_8888, \in2_8888, 3
+    and            \scratch1, \scratch1, \maskB
+    or             \out1_565, \out1_565, \scratch1
+    and            \in2_8888, \in2_8888, \maskG
+    or             \out1_565, \out1_565, \in2_8888
+    srl            \out2_565, \out1_565, 16
 .endm
 
 /*
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index c227feb..1949921 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -125,6 +125,8 @@ PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_DST (8888_8888, OVER,
                                          uint32_t, uint32_t)
 PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_DST (8888_0565, OVER,
                                          uint32_t, uint16_t)
+PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_DST (0565_8888, SRC,
+                                         uint16_t, uint32_t)
 
 PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST (0, 8888_8888, SRC,
                                           uint32_t, uint32_t)
@@ -370,6 +372,14 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
     PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, mips_8888_0565),
     PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, b5g6r5, mips_8888_0565),
 
+    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, x8b8g8r8, mips_0565_8888),
+    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, x8r8g8b8, mips_0565_8888),
+    /* Note: NONE repeat is not supported yet */
+    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, r5g6b5, a8r8g8b8, mips_0565_8888),
+    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, b5g6r5, a8b8g8r8, mips_0565_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, r5g6b5, a8r8g8b8, mips_0565_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, b5g6r5, a8b8g8r8, mips_0565_8888),
+
     PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8r8g8b8, r5g6b5, mips_8888_8_0565),
     PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8b8g8r8, b5g6r5, mips_8888_8_0565),
 
commit 311d55b6d8e1ac3acaa12d1d7c3eefdcfdc70718
Author: Nemanja Lukic <nemanja.lukic at rt-rk.com>
Date:   Mon Apr 15 19:32:56 2013 +0200

    MIPS: DSPr2: Added over_8888_0565 nearest neighbor fast path.
    
    Performance numbers before/after on MIPS-74kc @ 1GHz:
    
    lowlevel-blt-bench results
    
    Referent (before):
            over_8888_0565 =  L1:  13.22  L2:  12.02  M:  9.77 ( 38.92%)  HT:  8.58  VT:  8.35  R:  8.38  RT:  5.78 (  35Kops/s)
    
    Optimized:
            over_8888_0565 =  L1:  26.20  L2:  22.97  M: 15.92 ( 63.40%)  HT: 13.33  VT: 13.13  R: 12.72  RT:  7.65 (  39Kops/s)

diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index 4b8dc22..3996756 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -3130,6 +3130,72 @@ LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_8888_8888_OVER_asm_mips)
 
 END(pixman_scaled_nearest_scanline_8888_8888_OVER_asm_mips)
 
+LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_8888_0565_OVER_asm_mips)
+/*
+ * a0     - dst  (r5g6b5)
+ * a1     - src  (a8r8g8b8)
+ * a2     - w
+ * a3     - vx
+ * 16(sp) - unit_x
+ */
+
+    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, v0, v1
+    lw       t8, 40(sp) /* t8 = unit_x */
+    li       t4, 0x00ff00ff
+    li       t5, 0xf800f800
+    li       t6, 0x07e007e0
+    li       t7, 0x001F001F
+    beqz     a2, 3f
+     nop
+    addiu    t1, a2, -1
+    beqz     t1, 2f
+     nop
+1:
+    sra      t0, a3, 16 /* t0 = vx >> 16 */
+    sll      t0, t0, 2  /* t0 = t0 * 4 (a8r8g8b8) */
+    addu     t0, a1, t0
+    lw       t0, 0(t0)  /* t0 = source      (a8r8g8b8) */
+    addu     a3, a3, t8 /* a3 = vx + unit_x */
+    sra      t1, a3, 16 /* t0 = vx >> 16 */
+    sll      t1, t1, 2  /* t0 = t0 * 4 (a8r8g8b8) */
+    addu     t1, a1, t1
+    lw       t1, 0(t1)  /* t1 = source      (a8r8g8b8) */
+    addu     a3, a3, t8 /* a3 = vx + unit_x */
+    lhu      t2, 0(a0)  /* t2 = destination (r5g6b5) */
+    lhu      t3, 2(a0)  /* t3 = destination (r5g6b5) */
+
+    CONVERT_2x0565_TO_2x8888 t2, t3, v0, v1, t6, t7, s0, s1, s2, s3
+    OVER_2x8888_2x8888       t0, t1, v0, v1, t2, t3, t4, t9, s0, s1, s2, s3, s4
+    CONVERT_2x8888_TO_2x0565 t2, t3, v0, v1, t5, t6, t7, t9, s2
+
+    sh       v0, 0(a0)
+    sh       v1, 2(a0)
+    addiu    a2, a2, -2
+    addiu    t1, a2, -1
+    bgtz     t1, 1b
+     addiu   a0, a0, 4
+2:
+    beqz     a2, 3f
+     nop
+    sra      t0, a3, 16 /* t0 = vx >> 16 */
+    sll      t0, t0, 2  /* t0 = t0 * 4 (a8r8g8b8) */
+    addu     t0, a1, t0
+    lw       t0, 0(t0)  /* t0 = source      (a8r8g8b8) */
+    lhu      t1, 0(a0)  /* t1 = destination (r5g6b5) */
+    addu     a3, a3, t8 /* a3 = vx + unit_x */
+
+    CONVERT_1x0565_TO_1x8888 t1, t2, t5, t6
+    OVER_8888_8888           t0, t2, t1, t4, t3, t5, t6, t7
+    CONVERT_1x8888_TO_1x0565 t1, t2, t5, t6
+
+    sh       t2, 0(a0)
+3:
+    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, v0, v1
+    j        ra
+     nop
+
+END(pixman_scaled_nearest_scanline_8888_0565_OVER_asm_mips)
+
 LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_mips)
 /*
  * a0     - dst  (r5g6b5)
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index a68c86f..c227feb 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -123,6 +123,8 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_8888_8888_8888, uint32_t, 1,
 
 PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_DST (8888_8888, OVER,
                                          uint32_t, uint32_t)
+PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_DST (8888_0565, OVER,
+                                         uint32_t, uint16_t)
 
 PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST (0, 8888_8888, SRC,
                                           uint32_t, uint32_t)
@@ -365,6 +367,9 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
     PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mips_8888_8888),
     PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mips_8888_8888),
 
+    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, mips_8888_0565),
+    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, b5g6r5, mips_8888_0565),
+
     PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8r8g8b8, r5g6b5, mips_8888_8_0565),
     PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8b8g8r8, b5g6r5, mips_8888_8_0565),
 
commit bd487ee34c343142cbe451a2e04541d8aba0eaa7
Author: Nemanja Lukic <nemanja.lukic at rt-rk.com>
Date:   Mon Apr 15 19:32:55 2013 +0200

    MIPS: DSPr2: Added over_8888_8888 nearest neighbor fast path.
    
    Performance numbers before/after on MIPS-74kc @ 1GHz:
    
    lowlevel-blt-bench results
    
    Referent (before):
            over_8888_8888 =  L1:  19.47  L2:  16.30  M: 11.24 ( 59.69%)  HT:  9.54  VT:  9.29  R:  9.47  RT:  6.24 (  37Kops/s)
    
    Optimized:
            over_8888_8888 =  L1:  43.67  L2:  33.30  M: 16.32 ( 86.65%)  HT: 14.10  VT: 13.78  R: 12.96  RT:  7.85 (  39Kops/s)

diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index fb612d9..4b8dc22 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -3069,6 +3069,67 @@ LEAF_MIPS_DSPR2(pixman_composite_in_n_8_asm_mips)
 
 END(pixman_composite_in_n_8_asm_mips)
 
+LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_8888_8888_OVER_asm_mips)
+/*
+ * a0     - dst  (a8r8g8b8)
+ * a1     - src  (a8r8g8b8)
+ * a2     - w
+ * a3     - vx
+ * 16(sp) - unit_x
+ */
+
+    SAVE_REGS_ON_STACK 0, s0, s1, s2, s3
+    lw       t8, 16(sp) /* t8 = unit_x */
+    li       t6, 0x00ff00ff
+    beqz     a2, 3f
+     nop
+    addiu    t1, a2, -1
+    beqz     t1, 2f
+     nop
+1:
+    sra      t0, a3, 16 /* t0 = vx >> 16 */
+    sll      t0, t0, 2  /* t0 = t0 * 4 (a8r8g8b8) */
+    addu     t0, a1, t0
+    lw       t0, 0(t0)  /* t0 = source      (a8r8g8b8) */
+    addu     a3, a3, t8 /* a3 = vx + unit_x */
+
+    sra      t1, a3, 16 /* t0 = vx >> 16 */
+    sll      t1, t1, 2  /* t0 = t0 * 4 (a8r8g8b8) */
+    addu     t1, a1, t1
+    lw       t1, 0(t1)  /* t1 = source      (a8r8g8b8) */
+    addu     a3, a3, t8 /* a3 = vx + unit_x */
+
+    lw       t2, 0(a0)  /* t2 = destination (a8r8g8b8) */
+    lw       t3, 4(a0)  /* t3 = destination (a8r8g8b8) */
+
+    OVER_2x8888_2x8888 t0, t1, t2, t3, t4, t5, t6, t7, t9, s0, s1, s2, s3
+
+    sw       t4, 0(a0)
+    sw       t5, 4(a0)
+    addiu    a2, a2, -2
+    addiu    t1, a2, -1
+    bgtz     t1, 1b
+     addiu   a0, a0, 8
+2:
+    beqz     a2, 3f
+     nop
+    sra      t0, a3, 16 /* t0 = vx >> 16 */
+    sll      t0, t0, 2  /* t0 = t0 * 4 (a8r8g8b8) */
+    addu     t0, a1, t0
+    lw       t0, 0(t0)  /* t0 = source      (a8r8g8b8) */
+    lw       t1, 0(a0)  /* t1 = destination (a8r8g8b8) */
+    addu     a3, a3, t8 /* a3 = vx + unit_x */
+
+    OVER_8888_8888 t0, t1, t2, t6, t4, t5, t3, t7
+
+    sw       t2, 0(a0)
+3:
+    RESTORE_REGS_FROM_STACK 0, s0, s1, s2, s3
+    j        ra
+     nop
+
+END(pixman_scaled_nearest_scanline_8888_8888_OVER_asm_mips)
+
 LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_mips)
 /*
  * a0     - dst  (r5g6b5)
diff --git a/pixman/pixman-mips-dspr2-asm.h b/pixman/pixman-mips-dspr2-asm.h
index b330c0f..bc458b6 100644
--- a/pixman/pixman-mips-dspr2-asm.h
+++ b/pixman/pixman-mips-dspr2-asm.h
@@ -587,6 +587,36 @@ LEAF_MIPS32R2(symbol)                                   \
     addu_s.qb          \out_8888, \out_8888, \s_8888
 .endm
 
+/*
+ * OVER operation on two a8r8g8b8 source pixels (s1_8888 and s2_8888) and two
+ * a8r8g8b8 destination pixels (d1_8888 and d2_8888). It also requires maskLSR
+ * needed for rounding process. maskLSR must have following value:
+ *   li       maskLSR, 0x00ff00ff
+ */
+.macro OVER_2x8888_2x8888 s1_8888,   \
+                          s2_8888,   \
+                          d1_8888,   \
+                          d2_8888,   \
+                          out1_8888, \
+                          out2_8888, \
+                          maskLSR,   \
+                          scratch1, scratch2, scratch3, \
+                          scratch4, scratch5, scratch6
+    not                    \scratch1,  \s1_8888
+    srl                    \scratch1,  \scratch1,  24
+    not                    \scratch2,  \s2_8888
+    srl                    \scratch2,  \scratch2,  24
+    MIPS_2xUN8x4_MUL_2xUN8 \d1_8888,   \d2_8888, \
+                           \scratch1,  \scratch2,  \
+                           \out1_8888, \out2_8888, \
+                           \maskLSR, \
+                           \scratch3,  \scratch4, \scratch5, \
+                           \scratch6,  \d1_8888,  \d2_8888
+
+    addu_s.qb              \out1_8888, \out1_8888, \s1_8888
+    addu_s.qb              \out2_8888, \out2_8888, \s2_8888
+.endm
+
 .macro MIPS_UN8x4_MUL_UN8_ADD_UN8x4 s_8888,   \
                                     m_8,      \
                                     d_8888,   \
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index 1ea2445..a68c86f 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -121,6 +121,9 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_0565_8_0565, uint16_t, 1,
 PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_8888_8888_8888, uint32_t, 1,
                                          uint32_t, 1, uint32_t, 1)
 
+PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_DST (8888_8888, OVER,
+                                         uint32_t, uint32_t)
+
 PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST (0, 8888_8888, SRC,
                                           uint32_t, uint32_t)
 PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST (0, 8888_0565, SRC,
@@ -357,6 +360,11 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, mips_composite_over_reverse_n_8888),
     PIXMAN_STD_FAST_PATH (IN,           solid, null, a8,       mips_composite_in_n_8),
 
+    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mips_8888_8888),
+    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mips_8888_8888),
+    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mips_8888_8888),
+    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mips_8888_8888),
+
     PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8r8g8b8, r5g6b5, mips_8888_8_0565),
     PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8b8g8r8, b5g6r5, mips_8888_8_0565),
 
diff --git a/pixman/pixman-mips-dspr2.h b/pixman/pixman-mips-dspr2.h
index 4ac9ff9..955ed70 100644
--- a/pixman/pixman-mips-dspr2.h
+++ b/pixman/pixman-mips-dspr2.h
@@ -246,6 +246,48 @@ mips_composite_##name (pixman_implementation_t *imp,                     \
     }                                                                    \
 }
 
+/****************************************************************************/
+
+#define PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_DST(name, op,                    \
+                                                src_type, dst_type)          \
+void                                                                         \
+pixman_scaled_nearest_scanline_##name##_##op##_asm_mips (                    \
+                                                   dst_type *       dst,     \
+                                                   const src_type * src,     \
+                                                   int32_t          w,       \
+                                                   pixman_fixed_t   vx,      \
+                                                   pixman_fixed_t   unit_x); \
+                                                                             \
+static force_inline void                                                     \
+scaled_nearest_scanline_mips_##name##_##op (dst_type *       pd,             \
+                                            const src_type * ps,             \
+                                            int32_t          w,              \
+                                            pixman_fixed_t   vx,             \
+                                            pixman_fixed_t   unit_x,         \
+                                            pixman_fixed_t   max_vx,         \
+                                            pixman_bool_t    zero_src)       \
+{                                                                            \
+    pixman_scaled_nearest_scanline_##name##_##op##_asm_mips (pd, ps, w,      \
+                                                             vx, unit_x);    \
+}                                                                            \
+                                                                             \
+FAST_NEAREST_MAINLOOP (mips_##name##_cover_##op,                             \
+                       scaled_nearest_scanline_mips_##name##_##op,           \
+                       src_type, dst_type, COVER)                            \
+FAST_NEAREST_MAINLOOP (mips_##name##_none_##op,                              \
+                       scaled_nearest_scanline_mips_##name##_##op,           \
+                       src_type, dst_type, NONE)                             \
+FAST_NEAREST_MAINLOOP (mips_##name##_pad_##op,                               \
+                       scaled_nearest_scanline_mips_##name##_##op,           \
+                       src_type, dst_type, PAD)
+
+/* Provide entries for the fast path table */
+#define PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH(op,s,d,func)                    \
+    SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func),                            \
+    SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func),                             \
+    SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func)
+
+
 /*****************************************************************************/
 
 #define PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_A8_DST(flags, name, op,           \
commit 66def909ad82ed4ccb49380031cb828655c9a47f
Author: Nemanja Lukic <nemanja.lukic at rt-rk.com>
Date:   Mon Apr 15 19:32:54 2013 +0200

    MIPS: DSPr2: Fix bug in over_n_8888_8888_ca/over_n_8888_0565_ca routines
    
    After introducing new PRNG (pseudorandom number generator) a bug in two DSPr2
    routines was revealed. Bug manifested by wrong calculation in composite and
    glyph tests, which caused make check to fail for MIPS DSPr2 optimizations.
    
    Bug was in the calculation of the:
    *dst = over (src, *dst) when ma == 0xffffffff
    
    In this case src was not negated and shifted right by 24 bits, it was only
    negated. When implementing this routine in the first place, I missplaced those
    shifts, which alowed me to combine code for over operation and:
        UN8x4_MUL_UN8x4 (s, ma);
        UN8x4_MUL_UN8 (ma, srca);
        ma = ~ma;
        UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s);
    So I decided to rewrite that piece of code from scratch. I changed logic, so
    now assembly code mimics code from pixman-fast-path.c but processes two pixels
    at a time. This code should be easier to debug and maintain.
    
    The bug was revealed in commit b31a6962. Errors were detected by composite
    and glyph tests.

diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index 3adbb2a..fb612d9 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -840,34 +840,35 @@ LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_8888_ca_asm_mips)
  * a3 - w
  */
 
-    SAVE_REGS_ON_STACK 8, s0, s1, s2, s3, s4, s5
-    beqz         a3, 4f
+    beqz         a3, 8f
      nop
+    SAVE_REGS_ON_STACK 8, s0, s1, s2, s3, s4, s5
+
     li           t6, 0xff
     addiu        t7, zero, -1 /* t7 = 0xffffffff */
     srl          t8, a1, 24   /* t8 = srca */
     li           t9, 0x00ff00ff
+
     addiu        t1, a3, -1
-    beqz         t1, 3f       /* last pixel */
+    beqz         t1, 4f       /* last pixel */
      nop
-    beq          t8, t6, 2f   /* if (srca == 0xff) */
-     nop
-1:
-                              /* a1 = src */
+
+0:
     lw           t0, 0(a2)    /* t0 = mask */
     lw           t1, 4(a2)    /* t1 = mask */
+    addiu        a3, a3, -2   /* w = w - 2 */
     or           t2, t0, t1
-    beqz         t2, 12f      /* if (t0 == 0) && (t1 == 0) */
+    beqz         t2, 3f      /* if (t0 == 0) && (t1 == 0) */
      addiu       a2, a2, 8
-    and          t3, t0, t1
-    move         t4, a1       /* t4 = src */
-    move         t5, a1       /* t5 = src */
+    and          t2, t0, t1
+    beq          t2, t7, 1f  /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */
+     nop
+
+//if(ma)
     lw           t2, 0(a0)    /* t2 = dst */
-    beq          t3, t7, 11f  /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */
-     lw          t3, 4(a0)    /* t3 = dst */
+    lw           t3, 4(a0)    /* t3 = dst */
     MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, t4, t5, t9, s0, s1, s2, s3, s4, s5
     MIPS_2xUN8x4_MUL_2xUN8   t0, t1, t8, t8, t0, t1, t9, s0, s1, s2, s3, s4, s5
-11:
     not          t0, t0
     not          t1, t1
     MIPS_2xUN8x4_MUL_2xUN8x4 t2, t3, t0, t1, t2, t3, t9, s0, s1, s2, s3, s4, s5
@@ -875,62 +876,79 @@ LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_8888_ca_asm_mips)
     addu_s.qb    t3, t5, t3
     sw           t2, 0(a0)
     sw           t3, 4(a0)
-12:
-    addiu        a3, a3, -2
     addiu        t1, a3, -1
-    bgtz         t1, 1b
+    bgtz         t1, 0b
      addiu       a0, a0, 8
-    b            3f
+    b            4f
+     nop
+1:
+//if (t0 == 0xffffffff) && (t1 == 0xffffffff):
+    beq          t8, t6, 2f   /* if (srca == 0xff) */
      nop
-2:
-                              /* a1 = src */
-    lw           t0, 0(a2)    /* t0 = mask */
-    lw           t1, 4(a2)    /* t1 = mask */
-    or           t2, t0, t1
-    beqz         t2, 22f      /* if (t0 == 0) & (t1 == 0) */
-     addiu       a2, a2, 8
-    and          t2, t0, t1
-    move         t4, a1
-    beq          t2, t7, 21f  /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */
-     move        t5, a1
     lw           t2, 0(a0)    /* t2 = dst */
     lw           t3, 4(a0)    /* t3 = dst */
-    MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, t4, t5, t9, s0, s1, s2, s3, s4, s5
-    not          t0, t0
-    not          t1, t1
-    MIPS_2xUN8x4_MUL_2xUN8x4 t2, t3, t0, t1, t2, t3, t9, s0, s1, s2, s3, s4, s5
-    addu_s.qb    t4, t4, t2
-    addu_s.qb    t5, t5, t3
-21:
-    sw           t4, 0(a0)
-    sw           t5, 4(a0)
-22:
-    addiu        a3, a3, -2
+    not          t0, a1
+    not          t1, a1
+    srl          t0, t0, 24
+    srl          t1, t1, 24
+    MIPS_2xUN8x4_MUL_2xUN8 t2, t3, t0, t1, t2, t3, t9, s0, s1, s2, s3, s4, s5
+    addu_s.qb    t2, a1, t2
+    addu_s.qb    t3, a1, t3
+    sw           t2, 0(a0)
+    sw           t3, 4(a0)
     addiu        t1, a3, -1
-    bgtz         t1, 2b
+    bgtz         t1, 0b
      addiu       a0, a0, 8
+    b            4f
+     nop
+2:
+    sw           a1, 0(a0)
+    sw           a1, 4(a0)
 3:
-    blez         a3, 4f
+    addiu        t1, a3, -1
+    bgtz         t1, 0b
+     addiu       a0, a0, 8
+
+4:
+    beqz         a3, 7f
      nop
                               /* a1 = src */
-    lw           t1, 0(a2)    /* t1 = mask */
-    beqz         t1, 4f
+    lw           t0, 0(a2)    /* t0 = mask */
+    beqz         t0, 7f       /* if (t0 == 0) */
      nop
-    move         t2, a1       /* t2 = src */
-    beq          t1, t7, 31f
-     lw          t0, 0(a0)    /* t0 = dst */
-
-    MIPS_UN8x4_MUL_UN8x4  a1, t1, t2, t9, t3, t4, t5, t6
-    MIPS_UN8x4_MUL_UN8    t1, t8, t1, t9, t3, t4, t5
-31:
-    not          t1, t1
-    MIPS_UN8x4_MUL_UN8x4  t0, t1, t0, t9, t3, t4, t5, t6
-    addu_s.qb    t0, t2, t0
-    sw           t0, 0(a0)
-4:
+    beq          t0, t7, 5f  /* if (t0 == 0xffffffff) */
+     nop
+//if(ma)
+    lw           t1, 0(a0)    /* t1 = dst */
+    MIPS_UN8x4_MUL_UN8x4  a1, t0, t2, t9, t3, t4, t5, s0
+    MIPS_UN8x4_MUL_UN8    t0, t8, t0, t9, t3, t4, t5
+    not          t0, t0
+    MIPS_UN8x4_MUL_UN8x4  t1, t0, t1, t9, t3, t4, t5, s0
+    addu_s.qb    t1, t2, t1
+    sw           t1, 0(a0)
     RESTORE_REGS_FROM_STACK 8, s0, s1, s2, s3, s4, s5
     j            ra
      nop
+5:
+//if (t0 == 0xffffffff)
+    beq          t8, t6, 6f   /* if (srca == 0xff) */
+     nop
+    lw           t1, 0(a0)    /* t1 = dst */
+    not          t0, a1
+    srl          t0, t0, 24
+    MIPS_UN8x4_MUL_UN8 t1, t0, t1, t9, t2, t3, t4
+    addu_s.qb    t1, a1, t1
+    sw           t1, 0(a0)
+    RESTORE_REGS_FROM_STACK 8, s0, s1, s2, s3, s4, s5
+    j            ra
+     nop
+6:
+    sw           a1, 0(a0)
+7:
+    RESTORE_REGS_FROM_STACK 8, s0, s1, s2, s3, s4, s5
+8:
+    j            ra
+     nop
 
 END(pixman_composite_over_n_8888_8888_ca_asm_mips)
 
@@ -942,106 +960,126 @@ LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_0565_ca_asm_mips)
  * a3 - w
  */
 
-    SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7, s8
-    beqz         a3, 4f
+    beqz         a3, 8f
      nop
-    li           t5, 0xf800f800
-    li           t6, 0x07e007e0
-    li           t7, 0x001F001F
-    li           t9, 0x00ff00ff
+    SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7, s8
 
+    li           t6, 0xff
+    addiu        t7, zero, -1 /* t7 = 0xffffffff */
     srl          t8, a1, 24   /* t8 = srca */
+    li           t9, 0x00ff00ff
+    li           s6, 0xf800f800
+    li           s7, 0x07e007e0
+    li           s8, 0x001F001F
+
     addiu        t1, a3, -1
-    beqz         t1, 3f       /* last pixel */
+    beqz         t1, 4f       /* last pixel */
      nop
-    li           s0, 0xff     /* s0 = 0xff */
-    addiu        s1, zero, -1 /* s1 = 0xffffffff */
 
-    beq          t8, s0, 2f   /* if (srca == 0xff) */
-     nop
-1:
-                              /* a1 = src */
+0:
     lw           t0, 0(a2)    /* t0 = mask */
     lw           t1, 4(a2)    /* t1 = mask */
+    addiu        a3, a3, -2   /* w = w - 2 */
     or           t2, t0, t1
-    beqz         t2, 12f      /* if (t0 == 0) && (t1 == 0) */
+    beqz         t2, 3f      /* if (t0 == 0) && (t1 == 0) */
      addiu       a2, a2, 8
-    and          t3, t0, t1
-    move         s2, a1       /* s2 = src */
-    move         s3, a1       /* s3 = src */
+    and          t2, t0, t1
+    beq          t2, t7, 1f  /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */
+     nop
+
+//if(ma)
     lhu          t2, 0(a0)    /* t2 = dst */
-    beq          t3, s1, 11f  /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */
-     lhu         t3, 2(a0)    /* t3 = dst */
-    MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, s2, s3, t9, t4, s4, s5, s6, s7, s8
-    MIPS_2xUN8x4_MUL_2xUN8   t0, t1, t8, t8, t0, t1, t9, t4, s4, s5, s6, s7, s8
-11:
+    lhu          t3, 2(a0)    /* t3 = dst */
+    MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, t4, t5, t9, s0, s1, s2, s3, s4, s5
+    MIPS_2xUN8x4_MUL_2xUN8   t0, t1, t8, t8, t0, t1, t9, s0, s1, s2, s3, s4, s5
     not          t0, t0
     not          t1, t1
-    CONVERT_2x0565_TO_2x8888 t2, t3, s4, s5, t6, t7, t4, s6, s7, s8
-    MIPS_2xUN8x4_MUL_2xUN8x4 s4, s5, t0, t1, s4, s5, t9, t4, s6, s7, s8, t0, t1
-    addu_s.qb    s2, s2, s4
-    addu_s.qb    s3, s3, s5
-    CONVERT_2x8888_TO_2x0565 s2, s3, t2, t3, t5, t6, t7, s4, s5
+    CONVERT_2x0565_TO_2x8888 t2, t3, t2, t3, s7, s8, s0, s1, s2, s3
+    MIPS_2xUN8x4_MUL_2xUN8x4 t2, t3, t0, t1, t2, t3, t9, s0, s1, s2, s3, s4, s5
+    addu_s.qb    t2, t4, t2
+    addu_s.qb    t3, t5, t3
+    CONVERT_2x8888_TO_2x0565 t2, t3, t2, t3, s6, s7, s8, s0, s1
     sh           t2, 0(a0)
     sh           t3, 2(a0)
-12:
-    addiu        a3, a3, -2
     addiu        t1, a3, -1
-    bgtz         t1, 1b
+    bgtz         t1, 0b
      addiu       a0, a0, 4
-    b            3f
+    b            4f
+     nop
+1:
+//if (t0 == 0xffffffff) && (t1 == 0xffffffff):
+    beq          t8, t6, 2f   /* if (srca == 0xff) */
      nop
-2:
-                              /* a1 = src */
-    lw           t0, 0(a2)    /* t0 = mask */
-    lw           t1, 4(a2)    /* t1 = mask */
-    or           t2, t0, t1
-    beqz         t2, 22f      /* if (t0 == 0) & (t1 == 0) */
-     addiu       a2, a2, 8
-    and          t3, t0, t1
-    move         t2, a1
-    beq          t3, s1, 21f  /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */
-     move        t3, a1
     lhu          t2, 0(a0)    /* t2 = dst */
     lhu          t3, 2(a0)    /* t3 = dst */
-    MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, s2, s3, t9, t4, s4, s5, s6, s7, s8
-    not          t0, t0
-    not          t1, t1
-    CONVERT_2x0565_TO_2x8888 t2, t3, s4, s5, t6, t7, t4, s6, s7, s8
-    MIPS_2xUN8x4_MUL_2xUN8x4 s4, s5, t0, t1, s4, s5, t9, t4, s6, s7, s8, t2, t3
-    addu_s.qb    t2, s2, s4
-    addu_s.qb    t3, s3, s5
-21:
-    CONVERT_2x8888_TO_2x0565 t2, t3, t0, t1, t5, t6, t7, s2, s3
-    sh           t0, 0(a0)
-    sh           t1, 2(a0)
-22:
-    addiu        a3, a3, -2
+    not          t0, a1
+    not          t1, a1
+    srl          t0, t0, 24
+    srl          t1, t1, 24
+    CONVERT_2x0565_TO_2x8888 t2, t3, t2, t3, s7, s8, s0, s1, s2, s3
+    MIPS_2xUN8x4_MUL_2xUN8   t2, t3, t0, t1, t2, t3, t9, s0, s1, s2, s3, s4, s5
+    addu_s.qb    t2, a1, t2
+    addu_s.qb    t3, a1, t3
+    CONVERT_2x8888_TO_2x0565 t2, t3, t2, t3, s6, s7, s8, s0, s1
+    sh           t2, 0(a0)
+    sh           t3, 2(a0)
     addiu        t1, a3, -1
-    bgtz         t1, 2b
+    bgtz         t1, 0b
      addiu       a0, a0, 4
+    b            4f
+     nop
+2:
+    CONVERT_1x8888_TO_1x0565 a1, t2, s0, s1
+    sh           t2, 0(a0)
+    sh           t2, 2(a0)
 3:
-    blez         a3, 4f
+    addiu        t1, a3, -1
+    bgtz         t1, 0b
+     addiu       a0, a0, 4
+
+4:
+    beqz         a3, 7f
      nop
                               /* a1 = src */
-    lw           t1, 0(a2)    /* t1 = mask */
-    beqz         t1, 4f
+    lw           t0, 0(a2)    /* t0 = mask */
+    beqz         t0, 7f       /* if (t0 == 0) */
      nop
-    move         t2, a1       /* t2 = src */
-    beq          t1, t7, 31f
-     lhu         t0, 0(a0)    /* t0 = dst */
-
-    MIPS_UN8x4_MUL_UN8x4     a1, t1, t2, t9, t3, t4, t5, t6
-    MIPS_UN8x4_MUL_UN8       t1, t8, t1, t9, t3, t4, t5
-31:
-    not          t1, t1
-    CONVERT_1x0565_TO_1x8888 t0, s1, s2, s3
-    MIPS_UN8x4_MUL_UN8x4     s1, t1, t3, t9, t4, t5, t6, t7
-    addu_s.qb    t0, t2, t3
-    CONVERT_1x8888_TO_1x0565 t0, s1, s2, s3
-    sh           s1, 0(a0)
-4:
-    RESTORE_REGS_FROM_STACK  20, s0, s1, s2, s3, s4, s5, s6, s7, s8
+    beq          t0, t7, 5f  /* if (t0 == 0xffffffff) */
+     nop
+//if(ma)
+    lhu          t1, 0(a0)    /* t1 = dst */
+    MIPS_UN8x4_MUL_UN8x4     a1, t0, t2, t9, t3, t4, t5, s0
+    MIPS_UN8x4_MUL_UN8       t0, t8, t0, t9, t3, t4, t5
+    not          t0, t0
+    CONVERT_1x0565_TO_1x8888 t1, s1, s2, s3
+    MIPS_UN8x4_MUL_UN8x4     s1, t0, s1, t9, t3, t4, t5, s0
+    addu_s.qb    s1, t2, s1
+    CONVERT_1x8888_TO_1x0565 s1, t1, s0, s2
+    sh           t1, 0(a0)
+    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7, s8
+    j            ra
+     nop
+5:
+//if (t0 == 0xffffffff)
+    beq          t8, t6, 6f   /* if (srca == 0xff) */
+     nop
+    lhu          t1, 0(a0)    /* t1 = dst */
+    not          t0, a1
+    srl          t0, t0, 24
+    CONVERT_1x0565_TO_1x8888 t1, s1, s2, s3
+    MIPS_UN8x4_MUL_UN8       s1, t0, s1, t9, t2, t3, t4
+    addu_s.qb    s1, a1, s1
+    CONVERT_1x8888_TO_1x0565 s1, t1, s0, s2
+    sh           t1, 0(a0)
+    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7, s8
+    j            ra
+     nop
+6:
+    CONVERT_1x8888_TO_1x0565 a1, t1, s0, s2
+    sh           t1, 0(a0)
+7:
+    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7, s8
+8:
     j            ra
      nop