[PATCH 2/3] drm/radeon: add command submission IDs

Jerome Glisse j.glisse at gmail.com
Thu Sep 18 21:06:31 PDT 2014


On Thu, Sep 18, 2014 at 11:11:57PM -0400, Jerome Glisse wrote:
> On Thu, Sep 18, 2014 at 08:42:16PM -0400, Jerome Glisse wrote:
> > On Thu, Sep 18, 2014 at 05:30:12PM +0200, Christian König wrote:
> > > From: Christian König <christian.koenig at amd.com>
> > > 
> > > This patch adds a new 64bit ID as a result to each command submission.
> > 
> > NAK for design reasons.
> > 
> > First and foremost we DO NOT WANT TO ALLOCATE ANYTHING for submission id.
> > I know that hooking up to fence make life easy but this is not how it
> > should be done.
> > 
> > What you want to expose is the per ring 32bit seq value and report to user
> > space the value and the ring id. To handle the wrap around you add a unique
> > 32bit wrap counter which is incremented every 1 << 30 or some big number
> > seq value number (but must be smaller than 1 << 31). Then you use arithmetic
> > to determine if a cs seq number is done or not (ie the seq wrap around
> > counter diff should not be bigger than 2 or 3) and the seq number should
> > obviously be after, again using an arithmetic diff like the jiffies code.
> > 
> > All this being hidden in the kernel all the user space have to know is :
> > 32 bit seq value per ring
> > 32 bit ring uniq id
> > 32 wrap counter per ring
> > 
> > With such scheme you never allocate anything or worry about any fence.
> > Way simpler and less code.
> > 
> > Cheers,
> > Jérôme
> 
> Because code is clearer than words attached is a patch of what i am thinking
> about. The arithmetic should be right as well as the coherency and proper
> memory barrier. Though it is completely untested and will require a couple of
> fixes for properly checking userspace arguments and other aspects (see FIXME
> in patches).
> 
> There is no allocation, it is a pretty small patch, and it provide a fast
> lightweight solution. While the new ioctl argument and return value can be
> improved (like reporting more information to userspace like for instance
> warning userspace when it is querying very old fences). I think the overall
> design is sound.
> 
> Let me know.

Ok actually after a beer (given that the Planet Express manual for Bender, which
also applies to me, clearly stipulates to expect malfunction if not properly
inebriated) it became clear that by reversing the problem i could make it a
lot simpler. So attached is an even simpler patch that gracefully handles user
space querying for a very old fence (ie one that could have wrapped around).

Code explains it all.

> 
> Cheers,
> Jérôme
> 
> > 
> > 
> > > 
> > > Signed-off-by: Christian König <christian.koenig at amd.com>
> > > ---
> > >  drivers/gpu/drm/radeon/Makefile     |   2 +-
> > >  drivers/gpu/drm/radeon/radeon.h     |  13 +-
> > >  drivers/gpu/drm/radeon/radeon_cs.c  |  13 ++
> > >  drivers/gpu/drm/radeon/radeon_kms.c |  41 +++----
> > >  drivers/gpu/drm/radeon/radeon_seq.c | 229 ++++++++++++++++++++++++++++++++++++
> > >  include/uapi/drm/radeon_drm.h       |   1 +
> > >  6 files changed, 277 insertions(+), 22 deletions(-)
> > >  create mode 100644 drivers/gpu/drm/radeon/radeon_seq.c
> > > 
> > > diff --git a/drivers/gpu/drm/radeon/Makefile b/drivers/gpu/drm/radeon/Makefile
> > > index 8cc9f68..21ee928 100644
> > > --- a/drivers/gpu/drm/radeon/Makefile
> > > +++ b/drivers/gpu/drm/radeon/Makefile
> > > @@ -81,7 +81,7 @@ radeon-y += radeon_device.o radeon_asic.o radeon_kms.o \
> > >  	rv770_smc.o cypress_dpm.o btc_dpm.o sumo_dpm.o sumo_smc.o trinity_dpm.o \
> > >  	trinity_smc.o ni_dpm.o si_smc.o si_dpm.o kv_smc.o kv_dpm.o ci_smc.o \
> > >  	ci_dpm.o dce6_afmt.o radeon_vm.o radeon_ucode.o radeon_ib.o radeon_mn.o \
> > > -	radeon_sync.o
> > > +	radeon_sync.o radeon_seq.o
> > >  
> > >  # add async DMA block
> > >  radeon-y += \
> > > diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
> > > index b3b4e96..dbfd346 100644
> > > --- a/drivers/gpu/drm/radeon/radeon.h
> > > +++ b/drivers/gpu/drm/radeon/radeon.h
> > > @@ -423,6 +423,15 @@ static inline bool radeon_fence_is_earlier(struct radeon_fence *a,
> > >  }
> > >  
> > >  /*
> > > + * Userspace command submission identifier generation
> > > + */
> > > +struct radeon_seq;
> > > +
> > > +uint64_t radeon_seq_push(struct radeon_seq **seq, struct radeon_fence *fence);
> > > +struct radeon_fence *radeon_seq_query(struct radeon_seq *seq, uint64_t id);
> > > +void radeon_seq_destroy(struct radeon_seq **seq);
> > > +
> > > +/*
> > >   * Tiling registers
> > >   */
> > >  struct radeon_surface_reg {
> > > @@ -946,7 +955,9 @@ struct radeon_vm_manager {
> > >   * file private structure
> > >   */
> > >  struct radeon_fpriv {
> > > -	struct radeon_vm		vm;
> > > +	struct radeon_vm	vm;
> > > +	struct mutex		seq_lock;
> > > +	struct radeon_seq	*seq;
> > >  };
> > >  
> > >  /*
> > > diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c
> > > index b8f20a5..0c0f0b3 100644
> > > --- a/drivers/gpu/drm/radeon/radeon_cs.c
> > > +++ b/drivers/gpu/drm/radeon/radeon_cs.c
> > > @@ -413,6 +413,19 @@ static void radeon_cs_parser_fini(struct radeon_cs_parser *parser, int error, bo
> > >  	unsigned i;
> > >  
> > >  	if (!error) {
> > > +		if (parser->chunk_flags &&
> > > +		    parser->chunk_flags->length_dw > 4) {
> > > +			struct radeon_fpriv *fpriv = parser->filp->driver_priv;
> > > +			uint32_t __user *to = parser->chunk_flags->user_ptr;
> > > +			uint64_t id;
> > > +
> > > +			mutex_lock(&fpriv->seq_lock);
> > > +			id = radeon_seq_push(&fpriv->seq, parser->ib.fence);
> > > +			mutex_unlock(&fpriv->seq_lock);
> > > +
> > > +			copy_to_user(&to[3], &id, sizeof(uint64_t));
> > > +		}
> > > +
> > >  		/* Sort the buffer list from the smallest to largest buffer,
> > >  		 * which affects the order of buffers in the LRU list.
> > >  		 * This assures that the smallest buffers are added first
> > > diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c
> > > index 85ee6f7..38880af 100644
> > > --- a/drivers/gpu/drm/radeon/radeon_kms.c
> > > +++ b/drivers/gpu/drm/radeon/radeon_kms.c
> > > @@ -578,39 +578,34 @@ void radeon_driver_lastclose_kms(struct drm_device *dev)
> > >   */
> > >  int radeon_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv)
> > >  {
> > > +	struct radeon_fpriv *fpriv = kzalloc(sizeof(*fpriv), GFP_KERNEL);
> > >  	struct radeon_device *rdev = dev->dev_private;
> > >  	int r;
> > >  
> > > -	file_priv->driver_priv = NULL;
> > > +	if (unlikely(!fpriv))
> > > +		return -ENOMEM;
> > > +
> > > +	file_priv->driver_priv = fpriv;
> > >  
> > >  	r = pm_runtime_get_sync(dev->dev);
> > >  	if (r < 0)
> > > -		return r;
> > > +		goto error;
> > >  
> > >  	/* new gpu have virtual address space support */
> > >  	if (rdev->family >= CHIP_CAYMAN) {
> > > -		struct radeon_fpriv *fpriv;
> > >  		struct radeon_vm *vm;
> > >  		int r;
> > >  
> > > -		fpriv = kzalloc(sizeof(*fpriv), GFP_KERNEL);
> > > -		if (unlikely(!fpriv)) {
> > > -			return -ENOMEM;
> > > -		}
> > > -
> > >  		vm = &fpriv->vm;
> > >  		r = radeon_vm_init(rdev, vm);
> > > -		if (r) {
> > > -			kfree(fpriv);
> > > -			return r;
> > > -		}
> > > +		if (r)
> > > +			goto error;
> > >  
> > >  		if (rdev->accel_working) {
> > >  			r = radeon_bo_reserve(rdev->ring_tmp_bo.bo, false);
> > >  			if (r) {
> > >  				radeon_vm_fini(rdev, vm);
> > > -				kfree(fpriv);
> > > -				return r;
> > > +				goto error;
> > >  			}
> > >  
> > >  			/* map the ib pool buffer read only into
> > > @@ -623,16 +618,20 @@ int radeon_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv)
> > >  						  RADEON_VM_PAGE_SNOOPED);
> > >  			if (r) {
> > >  				radeon_vm_fini(rdev, vm);
> > > -				kfree(fpriv);
> > > -				return r;
> > > +				goto error;
> > >  			}
> > >  		}
> > > -		file_priv->driver_priv = fpriv;
> > >  	}
> > >  
> > > +	mutex_init(&fpriv->seq_lock);
> > > +
> > >  	pm_runtime_mark_last_busy(dev->dev);
> > >  	pm_runtime_put_autosuspend(dev->dev);
> > >  	return 0;
> > > +
> > > +error:
> > > +	kfree(fpriv);
> > > +	return r;
> > >  }
> > >  
> > >  /**
> > > @@ -646,11 +645,13 @@ int radeon_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv)
> > >  void radeon_driver_postclose_kms(struct drm_device *dev,
> > >  				 struct drm_file *file_priv)
> > >  {
> > > +	struct radeon_fpriv *fpriv = file_priv->driver_priv;
> > >  	struct radeon_device *rdev = dev->dev_private;
> > >  
> > > +	radeon_seq_destroy(&fpriv->seq);
> > > +
> > >  	/* new gpu have virtual address space support */
> > >  	if (rdev->family >= CHIP_CAYMAN && file_priv->driver_priv) {
> > > -		struct radeon_fpriv *fpriv = file_priv->driver_priv;
> > >  		struct radeon_vm *vm = &fpriv->vm;
> > >  		int r;
> > >  
> > > @@ -664,9 +665,9 @@ void radeon_driver_postclose_kms(struct drm_device *dev,
> > >  		}
> > >  
> > >  		radeon_vm_fini(rdev, vm);
> > > -		kfree(fpriv);
> > > -		file_priv->driver_priv = NULL;
> > >  	}
> > > +	kfree(fpriv);
> > > +	file_priv->driver_priv = NULL;
> > >  }
> > >  
> > >  /**
> > > diff --git a/drivers/gpu/drm/radeon/radeon_seq.c b/drivers/gpu/drm/radeon/radeon_seq.c
> > > new file mode 100644
> > > index 0000000..d8857f1
> > > --- /dev/null
> > > +++ b/drivers/gpu/drm/radeon/radeon_seq.c
> > > @@ -0,0 +1,229 @@
> > > +/*
> > > + * Copyright 2014 Advanced Micro Devices, Inc.
> > > + * All Rights Reserved.
> > > + *
> > > + * Permission is hereby granted, free of charge, to any person obtaining a
> > > + * copy of this software and associated documentation files (the
> > > + * "Software"), to deal in the Software without restriction, including
> > > + * without limitation the rights to use, copy, modify, merge, publish,
> > > + * distribute, sub license, and/or sell copies of the Software, and to
> > > + * permit persons to whom the Software is furnished to do so, subject to
> > > + * the following conditions:
> > > + *
> > > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> > > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> > > + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
> > > + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
> > > + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
> > > + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
> > > + * USE OR OTHER DEALINGS IN THE SOFTWARE.
> > > + *
> > > + * The above copyright notice and this permission notice (including the
> > > + * next paragraph) shall be included in all copies or substantial portions
> > > + * of the Software.
> > > + *
> > > + */
> > > +/*
> > > + * Authors:
> > > + *    Christian König <christian.koenig at amd.com>
> > > + */
> > > +
> > > +#include <drm/drmP.h>
> > > +#include "radeon.h"
> > > +
> > > +/*
> > > + * ID sequences
> > > + * This code generates a 64bit identifier for a command submission.
> > > + * It works by adding the fence of the command submission to an automatically
> > > + * resizing ring buffer.
> > > + */
> > > +
> > > +struct radeon_seq {
> > > +	uint64_t		start;
> > > +	uint64_t		end;
> > > +	uint64_t		mask;
> > > +	struct radeon_seq	*replacement;
> > > +};
> > > +
> > > +/**
> > > + * radeon_seq_create - create a new sequence object
> > > + *
> > > + * @start: start value for this sequence
> > > + * @size: size of the ring buffer, must be power of two
> > > + *
> > > + * Allocate and initialize a new ring buffer and header.
> > > + * Returns NULL if allocation fails, new object otherwise.
> > > + */
> > > +static struct radeon_seq *radeon_seq_create(uint64_t start, unsigned size)
> > > +{
> > > +	unsigned bytes = sizeof(struct radeon_seq) +
> > > +		size * sizeof(struct radeon_fence *);
> > > +
> > > +	struct radeon_seq *seq;
> > > +
> > > +	seq = kmalloc(bytes, GFP_KERNEL);
> > > +	if (!seq)
> > > +		return NULL;
> > > +
> > > +	seq->start = start;
> > > +	seq->end = start;
> > > +	seq->mask = size - 1;
> > > +	seq->replacement = NULL;
> > > +
> > > +	return seq;
> > > +}
> > > +
> > > +/**
> > > + * radeon_seq_ring - get pointer to ring buffer
> > > + *
> > > + * @seq: sequence object
> > > + *
> > > + * Calculate the address of the ring buffer.
> > > + */
> > > +static struct radeon_fence **radeon_seq_ring(struct radeon_seq *seq)
> > > +{
> > > +	return (struct radeon_fence **)&seq[1];
> > > +}
> > > +
> > > +/**
> > > + * radeon_seq_try_free - try to free fences from the ring buffer
> > > + *
> > > + * @seq: sequence object
> > > + *
> > > + * Try to free fences from the start of the ring buffer.
> > > + */
> > > +static void radeon_seq_try_free(struct radeon_seq *seq)
> > > +{
> > > +	struct radeon_fence **ring = radeon_seq_ring(seq);
> > > +
> > > +	while (seq->start != seq->end) {
> > > +		unsigned idx = seq->start & seq->mask;
> > > +		struct radeon_fence *fence = ring[idx];
> > > +
> > > +		if (!radeon_fence_signaled(fence))
> > > +			break;
> > > +
> > > +		radeon_fence_unref(&fence);
> > > +		++seq->start;
> > > +	}
> > > +}
> > > +
> > > +/**
> > > + * radeon_seq_add - add new fence to the end of the ring buffer
> > > + *
> > > + * @seq: sequence object
> > > + * @f: the fence object
> > > + *
> > > + * Add the fence and return the generated ID.
> > > + */
> > > +static uint64_t radeon_seq_add(struct radeon_seq *seq, struct radeon_fence *f)
> > > +{
> > > +	struct radeon_fence **ring = radeon_seq_ring(seq);
> > > +
> > > +	ring[seq->end & seq->mask] = radeon_fence_ref(f);
> > > +	return seq->end++;
> > > +}
> > > +
> > > +/**
> > > + * radeon_seq_push - check for room and add the fence
> > > + *
> > > + * @seq: sequence object
> > > + * @fence: the fence object
> > > + *
> > > + * Check for room on the ring buffer, if there isn't enough
> > > + * reallocate the sequence object and add the fence.
> > > + * Returns the generated ID.
> > > + */
> > > +uint64_t radeon_seq_push(struct radeon_seq **seq, struct radeon_fence *fence)
> > > +{
> > > +	unsigned size_for_new_seq = 4;
> > > +	uint64_t start_for_new_seq = 1;
> > > +
> > > +	if (*seq) {
> > > +		/* try to release old replacements */
> > > +		while ((*seq)->replacement) {
> > > +			radeon_seq_try_free(*seq);
> > > +			if ((*seq)->start == (*seq)->end) {
> > > +				struct radeon_seq *repl = (*seq)->replacement;
> > > +
> > > +				kfree(*seq);
> > > +				*seq = repl;
> > > +			} else {
> > > +				/* move on to the current container */
> > > +				seq = &(*seq)->replacement;
> > > +			}
> > > +		}
> > > +
> > > +		/* check if we have enough room for one more fence */
> > > +		radeon_seq_try_free(*seq);
> > > +		if (((*seq)->end - (*seq)->start) <= (*seq)->mask)
> > > +			return radeon_seq_add(*seq, fence);
> > > +
> > > +		/* not enough room, let's allocate a replacement */
> > > +		size_for_new_seq = ((*seq)->mask + 1) * 2;
> > > +		start_for_new_seq = (*seq)->end + 1;
> > > +		seq = &(*seq)->replacement;
> > > +	}
> > > +
> > > +	*seq = radeon_seq_create(start_for_new_seq, size_for_new_seq);
> > > +	if (!*seq) {
> > > +		/* not enough memory for a new sequence object, but failing
> > > +		   here isn't a good idea either cause the commands are already
> > > +		   submitted to the hardware. So just block on the fence. */
> > > +		int r = radeon_fence_wait(fence, false);
> > > +		if (r)
> > > +			DRM_ERROR("Error waiting for fence (%d)\n", r);
> > > +		return 0;
> > > +	}
> > > +	return radeon_seq_add(*seq, fence);
> > > +}
> > > +
> > > +/**
> > > + * radeon_seq_query - look up a fence by its ID
> > > + *
> > > + * @seq: sequence object
> > > + * @id: the generated ID
> > > + *
> > > + * Look up the associated fence by its ID.
> > > + * Returns fence object or NULL if it couldn't be found.
> > > + */
> > > +struct radeon_fence *radeon_seq_query(struct radeon_seq *seq, uint64_t id)
> > > +{
> > > +	struct radeon_fence **ring;
> > > +
> > > +	while (seq && id >= seq->end)
> > > +		seq = seq->replacement;
> > > +
> > > +	if (!seq || id < seq->start)
> > > +		return NULL;
> > > +
> > > +	ring = radeon_seq_ring(seq);
> > > +	return ring[id & seq->mask];
> > > +}
> > > +
> > > +/**
> > > + * radeon_seq_destroy - destroy the sequence object
> > > + *
> > > + * @seq_ptr: pointer to sequence object
> > > + *
> > > + * Destroy the sequence objects and release all fence references taken.
> > > + */
> > > +void radeon_seq_destroy(struct radeon_seq **seq_ptr)
> > > +{
> > > +	struct radeon_seq *seq = *seq_ptr;
> > > +	while (seq) {
> > > +		struct radeon_seq *repl = seq->replacement;
> > > +		uint64_t start = seq->start;
> > > +		uint64_t end = seq->end;
> > > +		struct radeon_fence **ring;
> > > +		uint64_t i;
> > > +
> > > +		ring = radeon_seq_ring(seq);
> > > +		for (i = start; i < end; ++i)
> > > +			radeon_fence_unref(&ring[i & seq->mask]);
> > > +
> > > +		kfree(seq);
> > > +		seq = repl;
> > > +	}
> > > +	*seq_ptr = NULL;
> > > +}
> > > diff --git a/include/uapi/drm/radeon_drm.h b/include/uapi/drm/radeon_drm.h
> > > index 50d0fb4..6b2b2e7 100644
> > > --- a/include/uapi/drm/radeon_drm.h
> > > +++ b/include/uapi/drm/radeon_drm.h
> > > @@ -959,6 +959,7 @@ struct drm_radeon_gem_va {
> > >  #define RADEON_CS_RING_VCE          4
> > >  /* The third dword of RADEON_CHUNK_ID_FLAGS is a sint32 that sets the priority */
> > >  /* 0 = normal, + = higher priority, - = lower priority */
> > > +/* The fourth and fifth dwords are a 64bit fence ID generated for this CS */
> > >  
> > >  struct drm_radeon_cs_chunk {
> > >  	uint32_t		chunk_id;
> > > -- 
> > > 1.9.1
> > > 
> > > _______________________________________________
> > > dri-devel mailing list
> > > dri-devel at lists.freedesktop.org
> > > http://lists.freedesktop.org/mailman/listinfo/dri-devel

> From b95fe503dad89875da9db2d11d05f93eca41232d Mon Sep 17 00:00:00 2001
> From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= <jglisse at redhat.com>
> Date: Thu, 18 Sep 2014 22:51:21 -0400
> Subject: [PATCH] drm/radeon: cs sequence id and cs completion query.
> MIME-Version: 1.0
> Content-Type: text/plain; charset=UTF-8
> Content-Transfer-Encoding: 8bit
> 
> This reports back to userspace the ring id and sequence number that can
> be used by userspace to query for the completion of the cs on the
> hardware. This also adds a new ioctl to perform such a query.
> 
> There is an extra value supplied to userspace, the wrap counter, that
> is used to detect wrap around and to gracefully handle the case where
> userspace might be querying about a very, very, very old cs executed
> a long time ago in a far far away universe.
> 
> This patch is aimed to introduce the necessary ground work for user
> space explicit synchronization. By allowing userspace to query about
> cs completion on hardware, user space can perform operations and
> synchronization on buffers by itself without having the cs ioctl to
> implicitly wait for older cs completion before scheduling a new cs.
> This part is however left to a follow-up patch.
> 
> Signed-off-by: Jérôme Glisse <jglisse at redhat.com>
> ---
>  drivers/gpu/drm/radeon/radeon.h       |  2 ++
>  drivers/gpu/drm/radeon/radeon_cs.c    | 60 +++++++++++++++++++++++++++++++++++
>  drivers/gpu/drm/radeon/radeon_fence.c |  4 +++
>  drivers/gpu/drm/radeon/radeon_kms.c   |  1 +
>  include/uapi/drm/radeon_drm.h         |  9 ++++++
>  5 files changed, 76 insertions(+)
> 
> diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
> index 5f05b4c..1f1dd1f 100644
> --- a/drivers/gpu/drm/radeon/radeon.h
> +++ b/drivers/gpu/drm/radeon/radeon.h
> @@ -118,6 +118,7 @@ extern int radeon_bapm;
>  #define RADEON_DEBUGFS_MAX_COMPONENTS		32
>  #define RADEONFB_CONN_LIMIT			4
>  #define RADEON_BIOS_NUM_SCRATCH			8
> +#define RADEON_SEQ_WRAP_VALUE			(1 << 30)
>  
>  /* fence seq are set to this number when signaled */
>  #define RADEON_FENCE_SIGNALED_SEQ		0LL
> @@ -355,6 +356,7 @@ struct radeon_fence_driver {
>  	/* sync_seq is protected by ring emission lock */
>  	uint64_t			sync_seq[RADEON_NUM_RINGS];
>  	atomic64_t			last_seq;
> +	int32_t				wrap_seq;
>  	bool				initialized;
>  };
>  
> diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c
> index 83f382e..be4ae25 100644
> --- a/drivers/gpu/drm/radeon/radeon_cs.c
> +++ b/drivers/gpu/drm/radeon/radeon_cs.c
> @@ -404,6 +404,17 @@ static void radeon_cs_parser_fini(struct radeon_cs_parser *parser, int error, bo
>  		ttm_eu_fence_buffer_objects(&parser->ticket,
>  					    &parser->validated,
>  					    parser->ib.fence);
> +		if (parser->chunk_flags && parser->chunk_flags->length_dw > 5) {
> +			uint32_t __user *to = parser->chunk_flags->user_ptr;
> +			uint32_t tmp;
> +
> +			tmp = lower_32_bits(parser->ib.fence->seq);
> +			copy_to_user(&to[3], &tmp, sizeof(uint32_t));
> +			tmp = parser->ib.fence->ring;
> +			copy_to_user(&to[4], &tmp, sizeof(uint32_t));
> +			tmp = rdev->fence_drv[tmp].wrap_seq;
> +			copy_to_user(&to[5], &tmp, sizeof(uint32_t));
> +		}
>  	} else if (backoff) {
>  		ttm_eu_backoff_reservation(&parser->ticket,
>  					   &parser->validated);
> @@ -823,3 +834,52 @@ int radeon_cs_packet_next_reloc(struct radeon_cs_parser *p,
>  		*cs_reloc = p->relocs_ptr[(idx / 4)];
>  	return 0;
>  }
> +
> +int radeon_cs_done_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
> +{
> +	struct radeon_device *rdev = dev->dev_private;
> +	struct drm_radeon_cs_done *args = data;
> +	unsigned i = args->ring;
> +	int32_t last_seq, sync_seq, wrap_seq;
> +
> +	/* FIXME check args->ring value is ok. */
> +
> +	/*
> +	 * The memory barrier is matched with the one in radeon_fence_emit() and
> +	 * it ensures that we get the right matching wrap_seq and sync_seq.
> +	 *
> +	 * Note that there is no need to protect fence_drv.sync_seq here as the
> +	 * barrier ensures we will get the coherency we need.
> +	 */
> +	wrap_seq = ACCESS_ONCE(rdev->fence_drv[i].wrap_seq);
> +	smp_rmb();
> +	sync_seq = lower_32_bits(ACCESS_ONCE(rdev->fence_drv[i].sync_seq[i]));
> +
> +	/*
> +	 * So if the current wrap_seq and the one we are queried with differ by
> +	 * more than one this means that we are queried about a very old fence
> +	 * seq value and we can assume it is long done.
> +	 *
> +	 * Well this is not entirely true, for it to be true we would need to
> +	 * stall when we increment the wrap counter if cs in previous wrap were
> +	 * not completed but this is highly unlikely. So live with the thrill of
> +	 * it going wrong !
> +	 */
> +	if (abs((unsigned)wrap_seq - (unsigned)args->wrap) > 1)
> +		return 1;
> +	/* Now check if currently reported fence seq is done or not. */
> +	/* FIXME call fence func to update last_seq just in case. */
> +	last_seq = lower_32_bits(atomic64_read(&rdev->fence_drv[i].last_seq));
> +	if ((last_seq - args->seq) >= 0)
> +		return 1;
> +	/*
> +	 * Last failsafe to handle the horrible case where userspace held on to
> +	 * a wrap and seq value for so long without querying that we wrapped
> +	 * around. This is here to avoid userspace waiting for a fence that was
> +	 * emitted a long time ago but the current sync_seq[ring] value might be
> +	 * stuck and thus we might go bigger than this very old seq value.
> +	 */
> +	if (((sync_seq - args->seq) < 0) && args->wrap == wrap_seq)
> +		return 1;
> +	return 0;
> +}
> diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
> index 9137870..a6adcf6 100644
> --- a/drivers/gpu/drm/radeon/radeon_fence.c
> +++ b/drivers/gpu/drm/radeon/radeon_fence.c
> @@ -119,6 +119,10 @@ int radeon_fence_emit(struct radeon_device *rdev,
>  	kref_init(&((*fence)->kref));
>  	(*fence)->rdev = rdev;
>  	(*fence)->seq = ++rdev->fence_drv[ring].sync_seq[ring];
> +	/* Barrier is important for radeon_fence_cs_done_ioctl. */
> +	smp_wmb();
> +	if (rdev->fence_drv[ring].sync_seq[ring] == RADEON_SEQ_WRAP_VALUE)
> +		rdev->fence_drv[ring].wrap_seq++;
>  	(*fence)->ring = ring;
>  	radeon_fence_ring_emit(rdev, ring, *fence);
>  	trace_radeon_fence_emit(rdev->ddev, ring, (*fence)->seq);
> diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c
> index eb7164d..c9cfcf5 100644
> --- a/drivers/gpu/drm/radeon/radeon_kms.c
> +++ b/drivers/gpu/drm/radeon/radeon_kms.c
> @@ -885,5 +885,6 @@ const struct drm_ioctl_desc radeon_ioctls_kms[] = {
>  	DRM_IOCTL_DEF_DRV(RADEON_GEM_BUSY, radeon_gem_busy_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW),
>  	DRM_IOCTL_DEF_DRV(RADEON_GEM_VA, radeon_gem_va_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW),
>  	DRM_IOCTL_DEF_DRV(RADEON_GEM_OP, radeon_gem_op_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW),
> +	DRM_IOCTL_DEF_DRV(RADEON_CS_DONE, radeon_cs_done_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW),
>  };
>  int radeon_max_kms_ioctl = ARRAY_SIZE(radeon_ioctls_kms);
> diff --git a/include/uapi/drm/radeon_drm.h b/include/uapi/drm/radeon_drm.h
> index fea6099..a0c215d 100644
> --- a/include/uapi/drm/radeon_drm.h
> +++ b/include/uapi/drm/radeon_drm.h
> @@ -554,6 +554,7 @@ typedef struct {
>  #define DRM_IOCTL_RADEON_GEM_BUSY	DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_BUSY, struct drm_radeon_gem_busy)
>  #define DRM_IOCTL_RADEON_GEM_VA		DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_VA, struct drm_radeon_gem_va)
>  #define DRM_IOCTL_RADEON_GEM_OP		DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_OP, struct drm_radeon_gem_op)
> +#define DRM_IOCTL_RADEON_CS_DONE	DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_CS_DONE, struct drm_radeon_cs_done)
>  
>  typedef struct drm_radeon_init {
>  	enum {
> @@ -936,6 +937,7 @@ struct drm_radeon_gem_va {
>  #define RADEON_CS_RING_VCE          4
>  /* The third dword of RADEON_CHUNK_ID_FLAGS is a sint32 that sets the priority */
>  /* 0 = normal, + = higher priority, - = lower priority */
> +/* The fourth, fifth and sixth dwords are the 32bit fence seq, ring id and wrap id of this CS */
>  
>  struct drm_radeon_cs_chunk {
>  	uint32_t		chunk_id;
> @@ -1038,4 +1040,11 @@ struct drm_radeon_info {
>  
>  #define CIK_TILE_MODE_DEPTH_STENCIL_1D		5
>  
> +struct drm_radeon_cs_done {
> +	int32_t			seq;
> +	int32_t			ring;
> +	int32_t			wrap;
> +	int32_t			pad;
> +};
> +
>  #endif
> -- 
> 1.9.3
> 

-------------- next part --------------
>From 195a9ce19dbe8c4af7e10fa8475d932826434441 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= <jglisse at redhat.com>
Date: Thu, 18 Sep 2014 22:51:21 -0400
Subject: [PATCH] drm/radeon: cs sequence id and cs completion query.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reports back to userspace the ring id and sequence number that can
be used by userspace to query for the completion of the cs on the
hardware. This also adds a new ioctl to perform such a query.

This patch is aimed to introduce the necessary ground work for user
space explicit synchronization. By allowing userspace to query about
cs completion on hardware, user space can perform operations and
synchronization on buffers by itself without having the cs ioctl to
implicitly wait for older cs completion before scheduling a new cs.
This part is however left to a follow-up patch.

Signed-off-by: Jérôme Glisse <jglisse at redhat.com>
---
 drivers/gpu/drm/radeon/radeon_cs.c  | 41 +++++++++++++++++++++++++++++++++++++
 drivers/gpu/drm/radeon/radeon_kms.c |  1 +
 include/uapi/drm/radeon_drm.h       |  8 ++++++++
 3 files changed, 50 insertions(+)

diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c
index 83f382e..50fcc09 100644
--- a/drivers/gpu/drm/radeon/radeon_cs.c
+++ b/drivers/gpu/drm/radeon/radeon_cs.c
@@ -404,6 +404,15 @@ static void radeon_cs_parser_fini(struct radeon_cs_parser *parser, int error, bo
 		ttm_eu_fence_buffer_objects(&parser->ticket,
 					    &parser->validated,
 					    parser->ib.fence);
+		if (parser->chunk_flags && parser->chunk_flags->length_dw > 5) {
+			uint32_t __user *to = parser->chunk_flags->user_ptr;
+			uint32_t tmp;
+
+			copy_to_user(&to[3], &parser->ib.fence->seq,
+				     sizeof(uint64_t));
+			tmp = parser->ib.fence->ring;
+			copy_to_user(&to[5], &tmp, sizeof(uint32_t));
+		}
 	} else if (backoff) {
 		ttm_eu_backoff_reservation(&parser->ticket,
 					   &parser->validated);
@@ -823,3 +832,35 @@ int radeon_cs_packet_next_reloc(struct radeon_cs_parser *p,
 		*cs_reloc = p->relocs_ptr[(idx / 4)];
 	return 0;
 }
+
+int radeon_cs_done_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
+{
+	struct radeon_device *rdev = dev->dev_private;
+	struct drm_radeon_cs_done *args = data;
+	unsigned ring = args->ring;
+	int64_t sync_seq, last_seq;
+
+	/* FIXME check args->ring value is ok. */
+
+	/*
+	 * If we consider that last_seq and sync_seq form a range (adjusting
+	 * for wrap around) ]last_seq, sync_seq]. Then queried cs is done if
+	 * its seq value is outside that range.
+	 *
+	 * This gracefully handles wrap around ie the case where userspace queries a
+	 * very old cs seq value, if the seq value is bigger than the current sync_seq
+	 * then we know it's an old and signaled fence. If it's smaller or
+	 * equal to last_seq we know it is signaled. If it's inside the range
+	 * ]last_seq, sync_seq] we can not know yet but we can be sure that it
+	 * will signal as at very least the hardware will increase last_seq up
+	 * to sync_seq.
+	 */
+	sync_seq = ACCESS_ONCE(rdev->fence_drv[ring].sync_seq[ring]);
+	last_seq = atomic64_read(&rdev->fence_drv[ring].last_seq);
+	if ((last_seq - args->seq) >= 0)
+		return 1;
+	if ((sync_seq - args->seq) < 0)
+		return 1;
+
+	return 0;
+}
diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c
index eb7164d..c9cfcf5 100644
--- a/drivers/gpu/drm/radeon/radeon_kms.c
+++ b/drivers/gpu/drm/radeon/radeon_kms.c
@@ -885,5 +885,6 @@ const struct drm_ioctl_desc radeon_ioctls_kms[] = {
 	DRM_IOCTL_DEF_DRV(RADEON_GEM_BUSY, radeon_gem_busy_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(RADEON_GEM_VA, radeon_gem_va_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(RADEON_GEM_OP, radeon_gem_op_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW),
+	DRM_IOCTL_DEF_DRV(RADEON_CS_DONE, radeon_cs_done_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW),
 };
 int radeon_max_kms_ioctl = ARRAY_SIZE(radeon_ioctls_kms);
diff --git a/include/uapi/drm/radeon_drm.h b/include/uapi/drm/radeon_drm.h
index fea6099..ed246d9 100644
--- a/include/uapi/drm/radeon_drm.h
+++ b/include/uapi/drm/radeon_drm.h
@@ -554,6 +554,7 @@ typedef struct {
 #define DRM_IOCTL_RADEON_GEM_BUSY	DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_BUSY, struct drm_radeon_gem_busy)
 #define DRM_IOCTL_RADEON_GEM_VA		DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_VA, struct drm_radeon_gem_va)
 #define DRM_IOCTL_RADEON_GEM_OP		DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_OP, struct drm_radeon_gem_op)
+#define DRM_IOCTL_RADEON_CS_DONE	DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_CS_DONE, struct drm_radeon_cs_done)
 
 typedef struct drm_radeon_init {
 	enum {
@@ -936,6 +937,7 @@ struct drm_radeon_gem_va {
 #define RADEON_CS_RING_VCE          4
 /* The third dword of RADEON_CHUNK_ID_FLAGS is a sint32 that sets the priority */
 /* 0 = normal, + = higher priority, - = lower priority */
+/* The fourth and fifth dwords are a 64bit fence seq, the sixth dword is the ring id */
 
 struct drm_radeon_cs_chunk {
 	uint32_t		chunk_id;
@@ -1038,4 +1040,10 @@ struct drm_radeon_info {
 
 #define CIK_TILE_MODE_DEPTH_STENCIL_1D		5
 
+struct drm_radeon_cs_done {
+	int64_t			seq;
+	int32_t			ring;
+	int32_t			pad;
+};
+
 #endif
-- 
1.9.3



More information about the dri-devel mailing list