[Mesa-dev] [PATCH 4/4] RFC: nir: add lowering for idiv/udiv/umod

Wed Apr 1 04:09:13 PDT 2015

Am 01.04.2015 um 03:44 schrieb Rob Clark:
> On Tue, Mar 31, 2015 at 9:03 PM, Roland Scheidegger <sroland at vmware.com> wrote:
>> Am 01.04.2015 um 00:57 schrieb Rob Clark:
>>> From: Rob Clark <robclark at freedesktop.org>
>>>
>>> Based on the algo from NV50LegalizeSSA::handleDIV() and handleMOD().
>>> See also trans_idiv() in freedreno/ir3/ir3_compiler.c (which was an
>>> adaptation of the nv50 code from Ilia).
>>>
>>> Just sending as an rfc right now, since I'm not quite at the point to be
>>> able to test it on actual hw.
>>>
>>> Signed-off-by: Rob Clark <robclark at freedesktop.org>
>>> ---
>>>  src/glsl/Makefile.sources     |   1 +
>>>  src/glsl/nir/nir.h            |   1 +
>>>  src/glsl/nir/nir_lower_idiv.c | 212 ++++++++++++++++++++++++++++++++++++++++++
>>>  3 files changed, 214 insertions(+)
>>>  create mode 100644 src/glsl/nir/nir_lower_idiv.c
>>>
>>> diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources
>>> index 18fff38..e426970 100644
>>> --- a/src/glsl/Makefile.sources
>>> +++ b/src/glsl/Makefile.sources
>>> @@ -32,6 +32,7 @@ NIR_FILES = \
>>>       nir/nir_lower_atomics.c \
>>>       nir/nir_lower_global_vars_to_local.c \
>>>       nir/nir_lower_locals_to_regs.c \
>>> +     nir/nir_lower_idiv.c \
>>>       nir/nir_lower_io.c \
>>>       nir/nir_lower_phis_to_scalar.c \
>>>       nir/nir_lower_samplers.cpp \
>>> diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
>>> index cd03d6b..e002d6f 100644
>>> --- a/src/glsl/nir/nir.h
>>> +++ b/src/glsl/nir/nir.h
>>> @@ -1605,6 +1605,7 @@ void nir_lower_samplers(nir_shader *shader,
>>>
>>>  void nir_lower_system_values(nir_shader *shader);
>>>  void nir_lower_tex_projector(nir_shader *shader);
>>> +void nir_lower_idiv(nir_shader *shader);
>>>
>>>  void nir_lower_atomics(nir_shader *shader);
>>>  void nir_lower_to_source_mods(nir_shader *shader);
>>> diff --git a/src/glsl/nir/nir_lower_idiv.c b/src/glsl/nir/nir_lower_idiv.c
>>> new file mode 100644
>>> index 0000000..e95c57e
>>> --- /dev/null
>>> +++ b/src/glsl/nir/nir_lower_idiv.c
>>> @@ -0,0 +1,212 @@
>>> +/*
>>> + * Copyright © 2015 Red Hat
>>> + *
>>> + * Permission is hereby granted, free of charge, to any person obtaining a
>>> + * copy of this software and associated documentation files (the "Software"),
>>> + * to deal in the Software without restriction, including without limitation
>>> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
>>> + * and/or sell copies of the Software, and to permit persons to whom the
>>> + * Software is furnished to do so, subject to the following conditions:
>>> + *
>>> + * The above copyright notice and this permission notice (including the next
>>> + * paragraph) shall be included in all copies or substantial portions of the
>>> + * Software.
>>> + *
>>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
>>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
>>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
>>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
>>> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
>>> + * IN THE SOFTWARE.
>>> + *
>>> + * Authors:
>>> + *    Rob Clark <robclark at freedesktop.org>
>>> + */
>>> +
>>> +#include "nir.h"
>>> +#include "nir_builder.h"
>>> +
>>> +/* Lowers idiv/udiv/umod
>>> + * Based on NV50LegalizeSSA::handleDIV()
>>> + *
>>> + * Note that this is probably not enough precision for compute shaders.
>>> + * Perhaps we want a second higher precision (looping) version of this?
>>> + * Or perhaps we assume if you can do compute shaders you can also
>>> + * branch out to a pre-optimized shader library routine..
>>
>> So if this is not enough precision, maybe should state how large the
>> error can be?
>>
> 
> tbh, if I knew what the error for this approach was, I would have
> included it.  I'm not the original author, but this is based on
> nouveau codegen code (as mentioned in the comment).  I guess it is
> better than converting to float and dividing and converting back, but
> worse than an iterative (ie. looping, ie. divergent flow control)
> approach.  It is apparently enough to keep piglit happy.
> 
> The original algo in nv50 lowering code is from
> 322bc7ed68ed92233c97168c036d0aa50c11a20e (ie. 'nv50/ir: import nv50
> target') which doesn't really give more clue about the origin..
> 
> if anyone knows, I'm all ears and will add relevant links/info to comment..

Ah ok. Well it isn't even obvious to me if the results are not actually
always exact.

Roland

> 
> BR,
> -R
> 
>> Roland
>>
>>> + *
>>> + *   vec4 af, bf, a, b, q, r;
>>> + *
>>> + *   if (op == idiv) {
>>> + *      af = i2f numerator
>>> + *      bf = i2f denominator
>>> + *      af = fabs af
>>> + *      bf = fabs bf
>>> + *      a  = iabs numerator
>>> + *      b  = iabs denominator
>>> + *   } else {
>>> + *      af = u2f numerator
>>> + *      bf = u2f denominator
>>> + *      a  = numerator
>>> + *      b  = denominator
>>> + *   }
>>> + *
>>> + *   ; get first result
>>> + *   bf = frcp bf
>>> + *   bf = iadd bf, -2    ; yes, subtract 2 as integer from float
>>> + *   q  = fmul af, bf
>>> + *
>>> + *   if (op == idiv) {
>>> + *      q = f2i q
>>> + *   } else {
>>> + *      q = f2u q
>>> + *   }
>>> + *
>>> + *   ; get error of first result:
>>> + *   r = imul q, b
>>> + *   r = isub a, r
>>> + *   r = u2f r
>>> + *
>>> + *   r = fmul r, bf
>>> + *   r = f2u r
>>> + *   q = iadd q, r
>>> + *
>>> + *   ; correction: if modulus >= divisor, add 1
>>> + *   t = imul q, b
>>> + *   r = isub a, t
>>> + *
>>> + *   r = fge r, b
>>> + *   r = b2i r
>>> + *
>>> + *   q = isub q, r
>>> + *   if (op == idiv) {
>>> + *      r = ixor a, b
>>> + *      r = ushr r, 31
>>> + *      r = i2b r
>>> + *      b = ineg q
>>> + *      q = bcsel r, b, q
>>> + *   }
>>> + *
>>> + *   if (op == umod) {
>>> + *      ; division result in q
>>> + *      r = imul q, b
>>> + *      q = isub a, r
>>> + *   }
>>> + *
>>> + *   dst = q
>>> + */
>>> +
>>> +static void
>>> +convert_instr(nir_builder *bld, nir_alu_instr *alu)
>>> +{
>>> +   nir_ssa_def *numer, *denom, *af, *bf, *a, *b, *q, *r;
>>> +   nir_op op = alu->op;
>>> +
>>> +   if ((op != nir_op_idiv) &&
>>> +       (op != nir_op_udiv) &&
>>> +       (op != nir_op_umod))
>>> +      return;
>>> +
>>> +   nir_builder_insert_before_instr(bld, &alu->instr);
>>> +
>>> +   numer = nir_ssa_for_src(bld, alu->src[0].src,
>>> +                           nir_ssa_alu_instr_src_components(alu, 0));
>>> +   denom = nir_ssa_for_src(bld, alu->src[1].src,
>>> +                           nir_ssa_alu_instr_src_components(alu, 1));
>>> +
>>> +   if (op == nir_op_idiv) {
>>> +      af = nir_i2f(bld, numer);
>>> +      bf = nir_i2f(bld, denom);
>>> +      af = nir_fabs(bld, af);
>>> +      bf = nir_fabs(bld, bf);
>>> +      a  = nir_iabs(bld, numer);
>>> +      b  = nir_iabs(bld, denom);
>>> +   } else {
>>> +      af = nir_u2f(bld, numer);
>>> +      bf = nir_u2f(bld, denom);
>>> +      a  = numer;
>>> +      b  = denom;
>>> +   }
>>> +
>>> +   /* get first result: */
>>> +   bf = nir_frcp(bld, bf);
>>> +   bf = nir_iadd(bld, bf, nir_imm_int(bld, -2));  /* yes, sub 2 as int from float */
>>> +   q  = nir_fmul(bld, af, bf);
>>> +
>>> +   if (op == nir_op_idiv) {
>>> +      q = nir_f2i(bld, q);
>>> +   } else {
>>> +      q = nir_f2u(bld, q);
>>> +   }
>>> +
>>> +   /* get error of first result: */
>>> +   r = nir_imul(bld, q, b);
>>> +   r = nir_isub(bld, a, r);
>>> +   r = nir_u2f(bld, r);
>>> +
>>> +   r = nir_fmul(bld, r, bf);
>>> +   r = nir_f2u(bld, r);
>>> +
>>> +   /* correction: if modulus >= divisor, add 1 */
>>> +   r = nir_imul(bld, q, b);
>>> +   r = nir_isub(bld, a, r);
>>> +
>>> +   r = nir_fge(bld, r, b);
>>> +   r = nir_b2i(bld, r);
>>> +
>>> +   q = nir_isub(bld, q, r);
>>> +   if (op == nir_op_idiv)  { /* ie. udiv or umod */
>>> +      r = nir_ixor(bld, a, b);
>>> +      r = nir_ushr(bld, r, nir_imm_int(bld, 31));
>>> +      r = nir_i2b(bld, r);
>>> +      b = nir_ineg(bld, q);
>>> +      q = nir_bcsel(bld, r, b, q);
>>> +   }
>>> +
>>> +   if (op == nir_op_umod) {
>>> +      /* division result in q */
>>> +      r = nir_imul(bld, q, b);
>>> +      q = nir_isub(bld, a, r);
>>> +   }
>>> +
>>> +   /* not quite sure if frob'ing the existing instr is what we
>>> +    * are *supposed* to do.. but wasn't sure a better way:
>>> +    */
>>> +   alu->op = nir_op_imov;
>>> +   nir_instr_rewrite_src(&alu->instr, &alu->src[0].src, nir_src_for_ssa(q));
>>> +}
>>> +
>>> +static bool
>>> +convert_block(nir_block *block, void *state)
>>> +{
>>> +   nir_builder *b = state;
>>> +
>>> +   nir_foreach_instr_safe(block, instr) {
>>> +      if (instr->type == nir_instr_type_alu)
>>> +         convert_instr(b, nir_instr_as_alu(instr));
>>> +   }
>>> +
>>> +   return true;
>>> +}
>>> +
>>> +static void
>>> +convert_impl(nir_function_impl *impl)
>>> +{
>>> +   nir_builder b;
>>> +   nir_builder_init(&b, impl);
>>> +
>>> +   nir_foreach_block(impl, convert_block, &b);
>>> +   nir_metadata_preserve(impl, nir_metadata_block_index |
>>> +                               nir_metadata_dominance);
>>> +}
>>> +
>>> +void
>>> +nir_lower_idiv(nir_shader *shader)
>>> +{
>>> +   nir_foreach_overload(shader, overload) {
>>> +      if (overload->impl)
>>> +         convert_impl(overload->impl);
>>> +   }
>>> +
>>> +   exec_list_make_empty(&shader->system_values);
>>> +}
>>>
>>