GPU hang trying to run OpenCL kernels on x86_64

Fri May 4 23:15:41 UTC 2018

Hi Slava,

The two x86_64 systems I tried are:
- System One
  Tyan S7025 with dual Xeon X5675 and 48GB registered ECC memory, with
a NVIDIA GTX 1050Ti 4GB(also used for display) and an AMD RX 550 4GB
  Running standard Ubuntu 16.04.4 with kernels
linux-image-4.13.0-38-generic and linux-image-4.4.0-122-generic,
mesa-17.2.8-0ubuntu0, libdrm-2.4.83-1
  and amdgpu-pro 17.50/amdgpu-pro 18.10
  lsb_release -a
  Description: Ubuntu 16.04.4 LTS

  BIOS configuration:
  ACPI enabled v3.0
  ACPI APIC support Enabled
  ACPI SRAT table Enabled
  SR-IOV Enabled
  Intel VT-d Disabled
  PCI MMIO 64 Bits support Disabled

- System Two
  Tyan S7002 with dual Xeon X5670 and 12GB registered ECC memory, with
an AMD RX 480
  Running Ubuntu 18.04 with kernels vanilla 4.16.7 and
linux-image-4.15.0-20-generic, mesa-18.0.0~rc5-1ubuntu1,
libdrm-2.4.91-2
  and mesa-opencl-icd, libclc-0.2.0+git20180312-1

  BIOS configuration:
  ACPI enabled v2.0
  ACPI APIC support Enabled
  ACPI SRAT table Enabled
  SR-IOV Enabled
  Intel VT-d Disabled
  PCI MMIO 64 Bits support Disabled

  amdgpu-pro-install --headless --opencl=legacy

When I try to run the attached openCL code (which computes a
cross-correlation between two square matrices directly by cross
correlation function definition), the GPU hangs, but there are also
other kernels where this also happens.

As soon as I try to run the kernel the system hangs at the first
kernel computation on all the two systems, and after a couple of
seconds dmesg shows:
[drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring gfx timeout, last
signaled seq=2, last emitted seq=3
[drm] IP block:gmc_v8_0 is hung!
[drm] IP block:tonga_ih is hung!
[drm] IP block:gfx_v8_0 is hung!
[drm] IP block:sdma_v3_0 is hung!
[drm] IP block:uvd_v6_0 is hung!
[drm] IP block:vce_v3_0 is hung!
[drm] GPU recovery disabled.

- On another system with armhf 32 bits, 1GB ram, 512GB SSD, AMD RX 480
or AMD RX 550
  with Ubuntu 17.10, vanilla kernel 4.16.7, mesa-18.0.2,
libdrm-2.4.92-git, libclc-git at commit
3d994f2ff2cbb4531223fe2657144cb19f0c5328 (15/Nov/2017)

  The kernels work properly on the same AMD cards.

On Fri, May 4, 2018 at 7:18 PM, Abramov, Slava <Slava.Abramov at amd.com> wrote:
> Luis,
>
>
> Can you please provide more details on your system environment and steps on
> configuring the software and reproducing the issue?
>
>
>
> Slava A
>
> ________________________________
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> on behalf of Luís
> Mendes <luis.p.mendes at gmail.com>
> Sent: Friday, May 4, 2018 12:27:47 PM
> To: amd-gfx list; Koenig, Christian; Michel Dänzer
> Subject: GPU hang trying to run OpenCL kernels on x86_64
>
> Hi,
>
> I am a collaborator with Syncleus/aparapi project on github and I've
> been testing OpenCL on AMD and NVIDIA cards.
>
> Currently I have a set of kernels that hang the GPU (AMD RX 460 and
> AMD RX 550) across all compute units on x86_64 running vanilla kernel
> 4.16.7 on Ubuntu 18.04, also on Ubuntu 16.04.4 with AMDGPU PRO 17.50
> and 18.10 show the same problems, in fact, AMDGPU-PRO 18.10 is even
> worse.
>
> However the same set of kernels run happily on armhf with vanilla
> Linux 4.16.7 and mesa 18.0 (mesa-opencl-icd and libclc for amdgcn),
> Ubuntu 17.10, on an AMD RX460 and an AMD RX 550.
>
> Luís Mendes
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
-------------- next part --------------
May 04, 2018 10:37:41 PM com.aparapi.internal.kernel.KernelRunner executeInternalInner
INFO: typedef struct This_s{
   __global int *tilesGeometry;
   __global int *inputGeometry;
   __global int *threadOutputStart;
   __global int *outputGeometry;
   __global int *threadOffsetI;
   __global int *threadOffsetJ;
   __global float *matrixInF;
   __global float *matrixInG;
   __global float *matrixOut;
   int passid;
}This;
int get_pass_id(This *this){
   return this->passid;
}
short pt_ist_ceris_vipivist_jobs_CrossCorrelationKernel__signX(This *this, short x){
   short value = (short)((x + x) + 1);
   return((short)(value / abs(value)));
}
short pt_ist_ceris_vipivist_jobs_CrossCorrelationKernel__relocateX(This *this, short x, short dimX){
   short result = (short)(((pt_ist_ceris_vipivist_jobs_CrossCorrelationKernel__signX(this, x) + 1) * (x + 1)) / 2);
   result = (short)(((pt_ist_ceris_vipivist_jobs_CrossCorrelationKernel__signX(this, (short)(dimX - result)) + 1) * result) / 2);
   return(result);
}
__kernel void run(
   __global int *tilesGeometry, 
   __global int *inputGeometry, 
   __global int *threadOutputStart, 
   __global int *outputGeometry, 
   __global int *threadOffsetI, 
   __global int *threadOffsetJ, 
   __global float *matrixInF, 
   __global float *matrixInG, 
   __global float *matrixOut, 
   int passid
){
   This thisStruct;
   This* this=&thisStruct;
   this->tilesGeometry = tilesGeometry;
   this->inputGeometry = inputGeometry;
   this->threadOutputStart = threadOutputStart;
   this->outputGeometry = outputGeometry;
   this->threadOffsetI = threadOffsetI;
   this->threadOffsetJ = threadOffsetJ;
   this->matrixInF = matrixInF;
   this->matrixInG = matrixInG;
   this->matrixOut = matrixOut;
   this->passid = passid;
   {
      int sizeI = get_global_size(1);
      int sizeJ = get_global_size(0);
      int i = get_global_id(1);
      int j = get_global_id(0);
      int k = get_global_id(2);
      int il = get_local_id(0);
      int jl = get_local_id(1);
      int matrixInputStart = ((((k * this->tilesGeometry[0]) * this->tilesGeometry[1]) * (this->inputGeometry[1] + 1)) * (this->inputGeometry[0] + 1)) + (((this->threadOutputStart[((i * sizeJ) + j)] / (this->outputGeometry[0] * this->outputGeometry[1])) * (this->inputGeometry[0] + 1)) * (this->inputGeometry[1] + 1));
      int matrixOutputStart = ((((k * this->tilesGeometry[0]) * this->tilesGeometry[1]) * this->outputGeometry[0]) * this->outputGeometry[1]) + this->threadOutputStart[((i * sizeJ) + j)];
      short subMatrixI = (short)this->threadOffsetI[i];
      short subMatrixJ = (short)this->threadOffsetJ[j];
      float accum = 0.0f;
      for (short indexN = (short)(-this->outputGeometry[0] / 2); indexN<=(this->outputGeometry[0] / 2); indexN = (short)(indexN + 1)){
         for (short indexM = (short)(-this->outputGeometry[1] / 2); indexM<=(this->outputGeometry[1] / 2); indexM = (short)(indexM + 1)){
            short fi = pt_ist_ceris_vipivist_jobs_CrossCorrelationKernel__relocateX(this, indexN, (short)this->inputGeometry[0]);
            short fj = pt_ist_ceris_vipivist_jobs_CrossCorrelationKernel__relocateX(this, indexM, (short)this->inputGeometry[1]);
            short gi = pt_ist_ceris_vipivist_jobs_CrossCorrelationKernel__relocateX(this, (short)(subMatrixI + indexN), (short)this->inputGeometry[0]);
            short gj = pt_ist_ceris_vipivist_jobs_CrossCorrelationKernel__relocateX(this, (short)(subMatrixJ + indexM), (short)this->inputGeometry[1]);
            accum = accum + (this->matrixInF[((matrixInputStart + (fi * (this->inputGeometry[1] + 1))) + fj)] * this->matrixInG[((matrixInputStart + (gi * (this->inputGeometry[1] + 1))) + gj)]);
         }
      }
      int outIndex = (matrixOutputStart + ((subMatrixI + (this->outputGeometry[0] / 2)) * this->outputGeometry[1])) + (subMatrixJ + (this->outputGeometry[1] / 2));
      this->matrixOut[outIndex]  = accum;
      return;
   }
}