[Mesa-dev] [PATCH 06/31] swr/rast: Separate RDTSC code from archrast

George Kyriazis george.kyriazis at intel.com
Tue Feb 13 22:42:24 UTC 2018


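Renamed the RDTSC defines more appropriately: AR_BEGIN/AR_END and AR_API_BEGIN/AR_API_END become RDTSC_BEGIN/RDTSC_END (AR_EVENT and AR_API_EVENT are unchanged).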
---
 src/gallium/drivers/swr/rasterizer/core/api.cpp    | 40 ++++++++--------
 .../drivers/swr/rasterizer/core/backend.cpp        | 24 +++++-----
 .../drivers/swr/rasterizer/core/backend_clear.cpp  |  8 ++--
 .../drivers/swr/rasterizer/core/backend_impl.h     | 38 +++++++--------
 .../drivers/swr/rasterizer/core/backend_sample.cpp | 36 +++++++-------
 .../swr/rasterizer/core/backend_singlesample.cpp   | 32 ++++++-------
 src/gallium/drivers/swr/rasterizer/core/binner.cpp | 18 +++----
 src/gallium/drivers/swr/rasterizer/core/clip.cpp   | 24 +++++-----
 src/gallium/drivers/swr/rasterizer/core/clip.h     |  4 +-
 src/gallium/drivers/swr/rasterizer/core/context.h  | 21 ++++----
 .../drivers/swr/rasterizer/core/frontend.cpp       | 56 +++++++++++-----------
 .../drivers/swr/rasterizer/core/rasterizer.cpp     |  8 ++--
 .../drivers/swr/rasterizer/core/rasterizer_impl.h  | 26 +++++-----
 .../drivers/swr/rasterizer/core/threads.cpp        |  8 ++--
 .../drivers/swr/rasterizer/core/tilemgr.cpp        | 24 +++++-----
 15 files changed, 181 insertions(+), 186 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index 09b482d..5e27e4d 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -256,9 +256,9 @@ void QueueWork(SWR_CONTEXT *pContext)
     }
     else
     {
-        AR_API_BEGIN(APIDrawWakeAllThreads, pDC->drawId);
+        RDTSC_BEGIN(APIDrawWakeAllThreads, pDC->drawId);
         WakeAllThreads(pContext);
-        AR_API_END(APIDrawWakeAllThreads, 1);
+        RDTSC_END(APIDrawWakeAllThreads, 1);
     }
 
     // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
@@ -278,7 +278,7 @@ INLINE void QueueDispatch(SWR_CONTEXT* pContext)
 
 DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
 {
-    AR_API_BEGIN(APIGetDrawContext, 0);
+    RDTSC_BEGIN(APIGetDrawContext, 0);
     // If current draw context is null then need to obtain a new draw context to use from ring.
     if (pContext->pCurDrawContext == nullptr)
     {
@@ -367,7 +367,7 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
         SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC");
     }
 
-    AR_API_END(APIGetDrawContext, 0);
+    RDTSC_END(APIGetDrawContext, 0);
     return pContext->pCurDrawContext;
 }
 
@@ -477,7 +477,7 @@ void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, uint
     SWR_CONTEXT *pContext = GetContext(hContext);
     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
 
-    AR_API_BEGIN(APISync, 0);
+    RDTSC_BEGIN(APISync, 0);
 
     pDC->FeWork.type = SYNC;
     pDC->FeWork.pfnWork = ProcessSync;
@@ -493,7 +493,7 @@ void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, uint
     //enqueue
     QueueDraw(pContext);
 
-    AR_API_END(APISync, 1);
+    RDTSC_END(APISync, 1);
 }
 
 void SwrStallBE(HANDLE hContext)
@@ -508,28 +508,28 @@ void SwrWaitForIdle(HANDLE hContext)
 {
     SWR_CONTEXT *pContext = GetContext(hContext);
 
-    AR_API_BEGIN(APIWaitForIdle, 0);
+    RDTSC_BEGIN(APIWaitForIdle, 0);
 
     while (!pContext->dcRing.IsEmpty())
     {
         _mm_pause();
     }
 
-    AR_API_END(APIWaitForIdle, 1);
+    RDTSC_END(APIWaitForIdle, 1);
 }
 
 void SwrWaitForIdleFE(HANDLE hContext)
 {
     SWR_CONTEXT *pContext = GetContext(hContext);
 
-    AR_API_BEGIN(APIWaitForIdle, 0);
+    RDTSC_BEGIN(APIWaitForIdle, 0);
 
     while (pContext->drawsOutstandingFE > 0)
     {
         _mm_pause();
     }
 
-    AR_API_END(APIWaitForIdle, 1);
+    RDTSC_END(APIWaitForIdle, 1);
 }
 
 void SwrSetVertexBuffers(
@@ -1167,7 +1167,7 @@ void DrawInstanced(
     SWR_CONTEXT *pContext = GetContext(hContext);
     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
 
-    AR_API_BEGIN(APIDraw, pDC->drawId);
+    RDTSC_BEGIN(APIDraw, pDC->drawId);
     AR_API_EVENT(DrawInstancedEvent(pDC->drawId, topology, numVertices, startVertex, numInstances, startInstance));
 
     uint32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology);
@@ -1230,7 +1230,7 @@ void DrawInstanced(
     pDC = GetDrawContext(pContext);
     pDC->pState->state.rastState.cullMode = oldCullMode;
 
-    AR_API_END(APIDraw, numVertices * numInstances);
+    RDTSC_END(APIDraw, numVertices * numInstances);
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -1295,7 +1295,7 @@ void DrawIndexedInstance(
     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
     API_STATE* pState = &pDC->pState->state;
 
-    AR_API_BEGIN(APIDrawIndexed, pDC->drawId);
+    RDTSC_BEGIN(APIDrawIndexed, pDC->drawId);
     AR_API_EVENT(DrawIndexedInstancedEvent(pDC->drawId, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance));
 
     uint32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology);
@@ -1376,7 +1376,7 @@ void DrawIndexedInstance(
     pDC = GetDrawContext(pContext);
     pDC->pState->state.rastState.cullMode = oldCullMode;
  
-    AR_API_END(APIDrawIndexed, numIndices * numInstances);
+    RDTSC_END(APIDrawIndexed, numIndices * numInstances);
 }
 
 
@@ -1508,7 +1508,7 @@ void SwrDispatch(
     SWR_CONTEXT *pContext = GetContext(hContext);
     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
 
-    AR_API_BEGIN(APIDispatch, pDC->drawId);
+    RDTSC_BEGIN(APIDispatch, pDC->drawId);
     AR_API_EVENT(DispatchEvent(pDC->drawId, threadGroupCountX, threadGroupCountY, threadGroupCountZ));
     pDC->isCompute = true;      // This is a compute context.
 
@@ -1524,7 +1524,7 @@ void SwrDispatch(
     pDC->pDispatch->initialize(totalThreadGroups, pTaskData, &ProcessComputeBE);
 
     QueueDispatch(pContext);
-    AR_API_END(APIDispatch, threadGroupCountX * threadGroupCountY * threadGroupCountZ);
+    RDTSC_END(APIDispatch, threadGroupCountX * threadGroupCountY * threadGroupCountZ);
 }
 
 // Deswizzles, converts and stores current contents of the hot tiles to surface
@@ -1543,7 +1543,7 @@ void SWR_API SwrStoreTiles(
     SWR_CONTEXT *pContext = GetContext(hContext);
     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
 
-    AR_API_BEGIN(APIStoreTiles, pDC->drawId);
+    RDTSC_BEGIN(APIStoreTiles, pDC->drawId);
 
     pDC->FeWork.type = STORETILES;
     pDC->FeWork.pfnWork = ProcessStoreTiles;
@@ -1557,7 +1557,7 @@ void SWR_API SwrStoreTiles(
 
     AR_API_EVENT(SwrStoreTilesEvent(pDC->drawId));
 
-    AR_API_END(APIStoreTiles, 1);
+    RDTSC_END(APIStoreTiles, 1);
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -1586,7 +1586,7 @@ void SWR_API SwrClearRenderTarget(
     SWR_CONTEXT *pContext = GetContext(hContext);
     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
 
-    AR_API_BEGIN(APIClearRenderTarget, pDC->drawId);
+    RDTSC_BEGIN(APIClearRenderTarget, pDC->drawId);
 
     pDC->FeWork.type = CLEAR;
     pDC->FeWork.pfnWork = ProcessClear;
@@ -1604,7 +1604,7 @@ void SWR_API SwrClearRenderTarget(
     // enqueue draw
     QueueDraw(pContext);
 
-    AR_API_END(APIClearRenderTarget, 1);
+    RDTSC_END(APIClearRenderTarget, 1);
 }
 
 //////////////////////////////////////////////////////////////////////////
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
index 6282e87..5878361 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
@@ -48,7 +48,7 @@ void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroup
 {
     SWR_CONTEXT *pContext = pDC->pContext;
 
-    AR_BEGIN(BEDispatch, pDC->drawId);
+    RDTSC_BEGIN(BEDispatch, pDC->drawId);
 
     const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData();
     SWR_ASSERT(pTaskData != nullptr);
@@ -82,7 +82,7 @@ void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroup
 
     UPDATE_STAT_BE(CsInvocations, state.totalThreadsInGroup);
 
-    AR_END(BEDispatch, 1);
+    RDTSC_END(BEDispatch, 1);
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -107,7 +107,7 @@ void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile
 {
     SWR_CONTEXT *pContext = pDC->pContext;
 
-    AR_BEGIN(BEStoreTiles, pDC->drawId);
+    RDTSC_BEGIN(BEStoreTiles, pDC->drawId);
 
     SWR_FORMAT srcFormat;
     switch (attachment)
@@ -159,7 +159,7 @@ void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile
             }
         }
     }
-    AR_END(BEStoreTiles, 1);
+    RDTSC_END(BEStoreTiles, 1);
 }
 
 void ProcessStoreTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
@@ -201,9 +201,9 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
 {
     SWR_CONTEXT *pContext = pDC->pContext;
 
-    AR_BEGIN(BENullBackend, pDC->drawId);
+    RDTSC_BEGIN(BENullBackend, pDC->drawId);
     ///@todo: handle center multisample pattern
-    AR_BEGIN(BESetup, pDC->drawId);
+    RDTSC_BEGIN(BESetup, pDC->drawId);
 
     const API_STATE &state = GetApiState(pDC);
 
@@ -216,7 +216,7 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
     SWR_PS_CONTEXT psContext;
     // skip SetupPixelShaderContext(&psContext, ...); // not needed here
 
-    AR_END(BESetup, 0);
+    RDTSC_END(BESetup, 0);
 
     simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
 
@@ -257,7 +257,7 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
                         coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
                     }
 
-                    AR_BEGIN(BEBarycentric, pDC->drawId);
+                    RDTSC_BEGIN(BEBarycentric, pDC->drawId);
 
                     // calculate per sample positions
                     psContext.vX.sample = _simd_add_ps(vXSamplePosUL, samplePos.vX(sample));
@@ -269,7 +269,7 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
                     psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
                     psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
 
-                    AR_END(BEBarycentric, 0);
+                    RDTSC_END(BEBarycentric, 0);
 
                     // interpolate user clip distance if available
                     if (state.backendState.clipDistanceMask)
@@ -280,13 +280,13 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
                     simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
                     simdscalar stencilPassMask = vCoverageMask;
 
-                    AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
+                    RDTSC_BEGIN(BEEarlyDepthTest, pDC->drawId);
                     simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
                         psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
                     AR_EVENT(EarlyDepthStencilInfoNullPS(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
                     DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
                         pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
-                    AR_END(BEEarlyDepthTest, 0);
+                    RDTSC_END(BEEarlyDepthTest, 0);
 
                     uint32_t statMask = _simd_movemask_ps(depthPassMask);
                     uint32_t statCount = _mm_popcnt_u32(statMask);
@@ -307,7 +307,7 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
         vYSamplePosUL = _simd_add_ps(vYSamplePosUL, dy);
     }
 
-    AR_END(BENullBackend, 0);
+    RDTSC_END(BENullBackend, 0);
 }
 
 PFN_CLEAR_TILES gClearTilesTable[NUM_SWR_FORMATS] = {};
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp b/src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp
index 0ef54e2..baaa7e6 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp
@@ -181,7 +181,7 @@ void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, vo
 
         SWR_ASSERT(pClear->attachmentMask != 0); // shouldn't be here without a reason.
 
-        AR_BEGIN(BEClear, pDC->drawId);
+        RDTSC_BEGIN(BEClear, pDC->drawId);
 
         if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
         {
@@ -217,13 +217,13 @@ void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, vo
             pHotTile->state = HOTTILE_CLEAR;
         }
 
-        AR_END(BEClear, 1);
+        RDTSC_END(BEClear, 1);
     }
     else
     {
         // Legacy clear
         CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
-        AR_BEGIN(BEClear, pDC->drawId);
+        RDTSC_BEGIN(BEClear, pDC->drawId);
 
         if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
         {
@@ -265,7 +265,7 @@ void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, vo
             pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
         }
 
-        AR_END(BEClear, 1);
+        RDTSC_END(BEClear, 1);
     }
 }
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h
index 593082b..b62ff03 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h
+++ b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h
@@ -600,7 +600,7 @@ struct PixelRateZTestLoop
                 vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], _simd_vmask_ps(CalcDepthBoundsAcceptMask(z, minz, maxz)));
             }
 
-            AR_BEGIN(BEBarycentric, pDC->drawId);
+            RDTSC_BEGIN(BEBarycentric, pDC->drawId);
 
             // calculate per sample positions
             psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
@@ -622,7 +622,7 @@ struct PixelRateZTestLoop
                 vZ[sample] = state.pfnQuantizeDepth(vZ[sample]);
             }
 
-            AR_END(BEBarycentric, 0);
+            RDTSC_END(BEBarycentric, 0);
 
             ///@todo: perspective correct vs non-perspective correct clipping?
             // if clip distances are enabled, we need to interpolate for each sample
@@ -635,13 +635,13 @@ struct PixelRateZTestLoop
 
             // ZTest for this sample
             ///@todo Need to uncomment out this bucket.
-            //AR_BEGIN(BEDepthBucket, pDC->drawId);
+            //RDTSC_BEGIN(BEDepthBucket, pDC->drawId);
             depthPassMask[sample] = vCoverageMask[sample];
             stencilPassMask[sample] = vCoverageMask[sample];
             depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
                                                      vZ[sample], pDepthSample, vCoverageMask[sample], 
                                                      pStencilSample, &stencilPassMask[sample]);
-            //AR_END(BEDepthBucket, 0);
+            //RDTSC_END(BEDepthBucket, 0);
 
             // early-exit if no pixels passed depth or earlyZ is forced on
             if(psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample]))
@@ -869,8 +869,8 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
 
     SWR_CONTEXT *pContext = pDC->pContext;
 
-    AR_BEGIN(BEPixelRateBackend, pDC->drawId);
-    AR_BEGIN(BESetup, pDC->drawId);
+    RDTSC_BEGIN(BEPixelRateBackend, pDC->drawId);
+    RDTSC_BEGIN(BESetup, pDC->drawId);
 
     const API_STATE &state = GetApiState(pDC);
 
@@ -884,7 +884,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
     uint8_t *pDepthBuffer, *pStencilBuffer;
     SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers);
 
-    AR_END(BESetup, 0);
+    RDTSC_END(BESetup, 0);
 
     PixelRateZTestLoop<T> PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBuffer, pStencilBuffer, state.backendState.clipDistanceMask);
 
@@ -916,13 +916,13 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
                 generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
             }
 
-            AR_BEGIN(BEBarycentric, pDC->drawId);
+            RDTSC_BEGIN(BEBarycentric, pDC->drawId);
 
             CalcPixelBarycentrics(coeffs, psContext);
 
             CalcCentroid<T, false>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
 
-            AR_END(BEBarycentric, 0);
+            RDTSC_END(BEBarycentric, 0);
 
             if(T::bForcedSampleCount)
             {
@@ -944,11 +944,11 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
 
             if(state.psState.usesSourceDepth)
             {
-                AR_BEGIN(BEBarycentric, pDC->drawId);
+                RDTSC_BEGIN(BEBarycentric, pDC->drawId);
                 // interpolate and quantize z
                 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
                 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
-                AR_END(BEBarycentric, 0);
+                RDTSC_END(BEBarycentric, 0);
             }
 
             // pixels that are currently active
@@ -956,10 +956,10 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
             psContext.oMask = T::MultisampleT::FullSampleMask();
 
             // execute pixel shader
-            AR_BEGIN(BEPixelShader, pDC->drawId);
+            RDTSC_BEGIN(BEPixelShader, pDC->drawId);
             state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
             UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
-            AR_END(BEPixelShader, 0);
+            RDTSC_END(BEPixelShader, 0);
 
             // update active lanes to remove any discarded or oMask'd pixels
             activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
@@ -980,7 +980,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
             // loop over all samples, broadcasting the results of the PS to all passing pixels
             for(uint32_t sample = 0; sample < GetNumOMSamples<T>(state.blendState.sampleCount); sample++)
             {
-                AR_BEGIN(BEOutputMerger, pDC->drawId);
+                RDTSC_BEGIN(BEOutputMerger, pDC->drawId);
                 // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
                 uint32_t coverageSampleNum = (T::bIsCenterPattern) ? 0 : sample;
                 simdscalar coverageMask, depthMask;
@@ -995,7 +995,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
                     if(!_simd_movemask_ps(depthMask))
                     {
                         // stencil should already have been written in early/lateZ tests
-                        AR_END(BEOutputMerger, 0);
+                        RDTSC_END(BEOutputMerger, 0);
                         continue;
                     }
                 }
@@ -1015,10 +1015,10 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
                     DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
                                       pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
                 }
-                AR_END(BEOutputMerger, 0);
+                RDTSC_END(BEOutputMerger, 0);
             }
 Endtile:
-            AR_BEGIN(BEEndTile, pDC->drawId);
+            RDTSC_BEGIN(BEEndTile, pDC->drawId);
 
             for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
             {
@@ -1054,7 +1054,7 @@ Endtile:
             pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
             pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
 
-            AR_END(BEEndTile, 0);
+            RDTSC_END(BEEndTile, 0);
 
             psContext.vX.UL     = _simd_add_ps(psContext.vX.UL,     dx);
             psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
@@ -1064,7 +1064,7 @@ Endtile:
         psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
     }
 
-    AR_END(BEPixelRateBackend, 0);
+    RDTSC_END(BEPixelRateBackend, 0);
 }
 
 template<uint32_t sampleCountT = SWR_MULTISAMPLE_1X, uint32_t isCenter = 0,
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp b/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp
index 04e34aa..8f0a5b1 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp
@@ -42,8 +42,8 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
 {
     SWR_CONTEXT *pContext = pDC->pContext;
 
-    AR_BEGIN(BESampleRateBackend, pDC->drawId);
-    AR_BEGIN(BESetup, pDC->drawId);
+    RDTSC_BEGIN(BESampleRateBackend, pDC->drawId);
+    RDTSC_BEGIN(BESetup, pDC->drawId);
 
     const API_STATE &state = GetApiState(pDC);
 
@@ -57,7 +57,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
     uint8_t *pDepthBuffer, *pStencilBuffer;
     SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers);
 
-    AR_END(BESetup, 0);
+    RDTSC_END(BESetup, 0);
 
     psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
     psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
@@ -83,13 +83,13 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
                 generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
             }
 
-            AR_BEGIN(BEBarycentric, pDC->drawId);
+            RDTSC_BEGIN(BEBarycentric, pDC->drawId);
 
             CalcPixelBarycentrics(coeffs, psContext);
 
             CalcCentroid<T, false>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
 
-            AR_END(BEBarycentric, 0);
+            RDTSC_END(BEBarycentric, 0);
 
             for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
             {
@@ -113,7 +113,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
                         coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
                     }
 
-                    AR_BEGIN(BEBarycentric, pDC->drawId);
+                    RDTSC_BEGIN(BEBarycentric, pDC->drawId);
 
                     // calculate per sample positions
                     psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
@@ -125,7 +125,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
                     psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
                     psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
 
-                    AR_END(BEBarycentric, 0);
+                    RDTSC_END(BEBarycentric, 0);
 
                     // interpolate user clip distance if available
                     if (state.backendState.clipDistanceMask)
@@ -140,11 +140,11 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
                     // Early-Z?
                     if (T::bCanEarlyZ)
                     {
-                        AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
+                        RDTSC_BEGIN(BEEarlyDepthTest, pDC->drawId);
                         depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
                             psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
                         AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
-                        AR_END(BEEarlyDepthTest, 0);
+                        RDTSC_END(BEEarlyDepthTest, 0);
 
                         // early-exit if no samples passed depth or earlyZ is forced on.
                         if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
@@ -164,21 +164,21 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
                     psContext.activeMask = _simd_castps_si(vCoverageMask);
 
                     // execute pixel shader
-                    AR_BEGIN(BEPixelShader, pDC->drawId);
+                    RDTSC_BEGIN(BEPixelShader, pDC->drawId);
                     UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
                     state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
-                    AR_END(BEPixelShader, 0);
+                    RDTSC_END(BEPixelShader, 0);
 
                     vCoverageMask = _simd_castsi_ps(psContext.activeMask);
 
                     // late-Z
                     if (!T::bCanEarlyZ)
                     {
-                        AR_BEGIN(BELateDepthTest, pDC->drawId);
+                        RDTSC_BEGIN(BELateDepthTest, pDC->drawId);
                         depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
                             psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
                         AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
-                        AR_END(BELateDepthTest, 0);
+                        RDTSC_END(BELateDepthTest, 0);
 
                         if (!_simd_movemask_ps(depthPassMask))
                         {
@@ -196,7 +196,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
                     UPDATE_STAT_BE(DepthPassCount, statCount);
 
                     // output merger
-                    AR_BEGIN(BEOutputMerger, pDC->drawId);
+                    RDTSC_BEGIN(BEOutputMerger, pDC->drawId);
 #if USE_8x2_TILE_BACKEND
                     OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask, useAlternateOffset);
 #else
@@ -209,7 +209,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
                         DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
                             pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
                     }
-                    AR_END(BEOutputMerger, 0);
+                    RDTSC_END(BEOutputMerger, 0);
                 }
                 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
             }
@@ -217,7 +217,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
         Endtile:
             ATTR_UNUSED;
 
-            AR_BEGIN(BEEndTile, pDC->drawId);
+            RDTSC_BEGIN(BEEndTile, pDC->drawId);
 
             if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
             {
@@ -247,7 +247,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
             pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
             pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
 
-            AR_END(BEEndTile, 0);
+            RDTSC_END(BEEndTile, 0);
 
             psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
             psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
@@ -257,7 +257,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
         psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
     }
 
-    AR_END(BESampleRateBackend, 0);
+    RDTSC_END(BESampleRateBackend, 0);
 }
 
 // Recursive template used to auto-nest conditionals.  Converts dynamic enum function
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp b/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp
index 686b979..57338af 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp
@@ -42,8 +42,8 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
 {
     SWR_CONTEXT *pContext = pDC->pContext;
 
-    AR_BEGIN(BESingleSampleBackend, pDC->drawId);
-    AR_BEGIN(BESetup, pDC->drawId);
+    RDTSC_BEGIN(BESingleSampleBackend, pDC->drawId);
+    RDTSC_BEGIN(BESetup, pDC->drawId);
 
     const API_STATE &state = GetApiState(pDC);
 
@@ -57,7 +57,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
     uint8_t *pDepthBuffer, *pStencilBuffer;
     SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers);
 
-    AR_END(BESetup, 1);
+    RDTSC_END(BESetup, 1);
 
     psContext.vY.UL     = _simd_add_ps(vULOffsetsY,     _simd_set1_ps(static_cast<float>(y)));
     psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
@@ -99,7 +99,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
                     generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
                 }
 
-                AR_BEGIN(BEBarycentric, pDC->drawId);
+                RDTSC_BEGIN(BEBarycentric, pDC->drawId);
 
                 CalcPixelBarycentrics(coeffs, psContext);
 
@@ -109,7 +109,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
                 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
                 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
 
-                AR_END(BEBarycentric, 1);
+                RDTSC_END(BEBarycentric, 1);
 
                 // interpolate user clip distance if available
                 if (state.backendState.clipDistanceMask)
@@ -124,11 +124,11 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
                 // Early-Z?
                 if (T::bCanEarlyZ)
                 {
-                    AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
+                    RDTSC_BEGIN(BEEarlyDepthTest, pDC->drawId);
                     depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
                                                      psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
                     AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
-                    AR_END(BEEarlyDepthTest, 0);
+                    RDTSC_END(BEEarlyDepthTest, 0);
 
                     // early-exit if no pixels passed depth or earlyZ is forced on
                     if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
@@ -147,21 +147,21 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
                 psContext.activeMask = _simd_castps_si(vCoverageMask);
 
                 // execute pixel shader
-                AR_BEGIN(BEPixelShader, pDC->drawId);
+                RDTSC_BEGIN(BEPixelShader, pDC->drawId);
                 UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
                 state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
-                AR_END(BEPixelShader, 0);
+                RDTSC_END(BEPixelShader, 0);
 
                 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
 
                 // late-Z
                 if (!T::bCanEarlyZ)
                 {
-                    AR_BEGIN(BELateDepthTest, pDC->drawId);
+                    RDTSC_BEGIN(BELateDepthTest, pDC->drawId);
                     depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
                                                         psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
                     AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
-                    AR_END(BELateDepthTest, 0);
+                    RDTSC_END(BELateDepthTest, 0);
 
                     if (!_simd_movemask_ps(depthPassMask))
                     {
@@ -181,7 +181,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
                 UPDATE_STAT_BE(DepthPassCount, statCount);
 
                 // output merger
-                AR_BEGIN(BEOutputMerger, pDC->drawId);
+                RDTSC_BEGIN(BEOutputMerger, pDC->drawId);
 #if USE_8x2_TILE_BACKEND
                 OutputMerger8x2(psContext, psContext.pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask, useAlternateOffset);
 #else
@@ -194,11 +194,11 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
                     DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
                         pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
                 }
-                AR_END(BEOutputMerger, 0);
+                RDTSC_END(BEOutputMerger, 0);
             }
 
 Endtile:
-            AR_BEGIN(BEEndTile, pDC->drawId);
+            RDTSC_BEGIN(BEEndTile, pDC->drawId);
 
             work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
             if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
@@ -229,7 +229,7 @@ Endtile:
             pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
             pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
 
-            AR_END(BEEndTile, 0);
+            RDTSC_END(BEEndTile, 0);
 
             psContext.vX.UL     = _simd_add_ps(psContext.vX.UL,     dx);
             psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
@@ -239,7 +239,7 @@ Endtile:
         psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
     }
 
-    AR_END(BESingleSampleBackend, 0);
+    RDTSC_END(BESingleSampleBackend, 0);
 }
 
 // Recursive template used to auto-nest conditionals.  Converts dynamic enum function
diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
index e2f3264..986ecc6 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
@@ -651,7 +651,7 @@ void SIMDCALL BinTrianglesImpl(
     SWR_CONTEXT *pContext = pDC->pContext;
     const uint32_t *aRTAI = reinterpret_cast<const uint32_t *>(&rtIdx);
 
-    AR_BEGIN(FEBinTriangles, pDC->drawId);
+    RDTSC_BEGIN(FEBinTriangles, pDC->drawId);
 
     const API_STATE& state = GetApiState(pDC);
     const SWR_RASTSTATE& rastState = state.rastState;
@@ -958,7 +958,7 @@ void SIMDCALL BinTrianglesImpl(
 
             if (!triMask)
             {
-                AR_END(FEBinTriangles, 1);
+                RDTSC_END(FEBinTriangles, 1);
                 return;
             }
         }
@@ -998,7 +998,7 @@ endBinTriangles:
 
         BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx);
 
-        AR_END(FEBinTriangles, 1);
+        RDTSC_END(FEBinTriangles, 1);
         return;
     }
     else if (rastState.fillMode == SWR_FILLMODE_POINT)
@@ -1008,7 +1008,7 @@ endBinTriangles:
         BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[1], triMask, primID, viewportIdx, rtIdx);
         BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[2], triMask, primID, viewportIdx, rtIdx);
 
-        AR_END(FEBinTriangles, 1);
+        RDTSC_END(FEBinTriangles, 1);
         return;
     }
 
@@ -1114,7 +1114,7 @@ endBinTriangles:
                      triMask &= ~(1 << triIndex);
     }
 
-    AR_END(FEBinTriangles, 1);
+    RDTSC_END(FEBinTriangles, 1);
 }
 
 template <typename CT>
@@ -1197,7 +1197,7 @@ void BinPostSetupPointsImpl(
 {
     SWR_CONTEXT *pContext = pDC->pContext;
 
-    AR_BEGIN(FEBinPoints, pDC->drawId);
+    RDTSC_BEGIN(FEBinPoints, pDC->drawId);
 
     typename SIMD_T::Vec4 &primVerts = prim[0];
 
@@ -1480,7 +1480,7 @@ void BinPostSetupPointsImpl(
         }
     }
 
-    AR_END(FEBinPoints, 1);
+    RDTSC_END(FEBinPoints, 1);
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -1608,7 +1608,7 @@ void BinPostSetupLinesImpl(
     SWR_CONTEXT *pContext = pDC->pContext;
     const uint32_t *aRTAI = reinterpret_cast<const uint32_t *>(&rtIdx);
 
-    AR_BEGIN(FEBinLines, pDC->drawId);
+    RDTSC_BEGIN(FEBinLines, pDC->drawId);
 
     const API_STATE &state = GetApiState(pDC);
     const SWR_RASTSTATE &rastState = state.rastState;
@@ -1789,7 +1789,7 @@ void BinPostSetupLinesImpl(
 
 endBinLines:
 
-    AR_END(FEBinLines, 1);
+    RDTSC_END(FEBinLines, 1);
 }
 
 //////////////////////////////////////////////////////////////////////////
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.cpp b/src/gallium/drivers/swr/rasterizer/core/clip.cpp
index 7205802..22d89bc 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.cpp
@@ -164,30 +164,30 @@ void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvecto
                    simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
-    AR_BEGIN(FEClipTriangles, pDC->drawId);
+    RDTSC_BEGIN(FEClipTriangles, pDC->drawId);
     Clipper<SIMD256, 3> clipper(workerId, pDC);
     clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);
-    AR_END(FEClipTriangles, 1);
+    RDTSC_END(FEClipTriangles, 1);
 }
 
 void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask,
                simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
-    AR_BEGIN(FEClipLines, pDC->drawId);
+    RDTSC_BEGIN(FEClipLines, pDC->drawId);
     Clipper<SIMD256, 2> clipper(workerId, pDC);
     clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);
-    AR_END(FEClipLines, 1);
+    RDTSC_END(FEClipLines, 1);
 }
 
 void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask,
                 simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
-    AR_BEGIN(FEClipPoints, pDC->drawId);
+    RDTSC_BEGIN(FEClipPoints, pDC->drawId);
     Clipper<SIMD256, 1> clipper(workerId, pDC);
     clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);
-    AR_END(FEClipPoints, 1);
+    RDTSC_END(FEClipPoints, 1);
 }
 
 #if USE_SIMD16_FRONTEND
@@ -195,7 +195,7 @@ void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t wor
                                    simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
-    AR_BEGIN(FEClipTriangles, pDC->drawId);
+    RDTSC_BEGIN(FEClipTriangles, pDC->drawId);
 
     enum { VERTS_PER_PRIM = 3 };
 
@@ -204,14 +204,14 @@ void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t wor
     pa.useAlternateOffset = false;
     clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);
 
-    AR_END(FEClipTriangles, 1);
+    RDTSC_END(FEClipTriangles, 1);
 }
 
 void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask,
                                simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
-    AR_BEGIN(FEClipLines, pDC->drawId);
+    RDTSC_BEGIN(FEClipLines, pDC->drawId);
 
     enum { VERTS_PER_PRIM = 2 };
 
@@ -220,14 +220,14 @@ void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerI
     pa.useAlternateOffset = false;
     clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);
 
-    AR_END(FEClipLines, 1);
+    RDTSC_END(FEClipLines, 1);
 }
 
 void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask,
                                 simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
-    AR_BEGIN(FEClipPoints, pDC->drawId);
+    RDTSC_BEGIN(FEClipPoints, pDC->drawId);
 
     enum { VERTS_PER_PRIM = 1 };
 
@@ -236,7 +236,7 @@ void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t worker
     pa.useAlternateOffset = false;
     clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);
 
-    AR_END(FEClipPoints, 1);
+    RDTSC_END(FEClipPoints, 1);
 }
 
 #endif
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h
index 592c9bf..cda40f1 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.h
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h
@@ -719,11 +719,11 @@ public:
 
         if (clipMask)
         {
-            AR_BEGIN(FEGuardbandClip, pa.pDC->drawId);
+            RDTSC_BEGIN(FEGuardbandClip, pa.pDC->drawId);
             // we have to clip tris, execute the clipper, which will also
             // call the binner
             ClipSimd(prim, SIMD_T::vmask_ps(primMask), SIMD_T::vmask_ps(clipMask), pa, primId, viewportIdx, rtIdx);
-            AR_END(FEGuardbandClip, 1);
+            RDTSC_END(FEGuardbandClip, 1);
         }
         else if (validMask)
         {
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index 6a63838..5bae53f 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -526,30 +526,25 @@ struct SWR_CONTEXT
 #define AR_WORKER_CTX  pContext->pArContext[workerId]
 #define AR_API_CTX     pContext->pArContext[pContext->NumWorkerThreads]
 
+#ifdef KNOB_ENABLE_RDTSC
+#define RDTSC_BEGIN(type, drawid) RDTSC_START(type)
+#define RDTSC_END(type, count)   RDTSC_STOP(type, count, 0)
+#else
+#define RDTSC_BEGIN(type, drawid)
+#define RDTSC_END(type, count)
+#endif
+
 #ifdef KNOB_ENABLE_AR
-    #define _AR_BEGIN(ctx, type, id)    ArchRast::Dispatch(ctx, ArchRast::Start(ArchRast::type, id))
-    #define _AR_END(ctx, type, count)   ArchRast::Dispatch(ctx, ArchRast::End(ArchRast::type, count))
     #define _AR_EVENT(ctx, event)       ArchRast::Dispatch(ctx, ArchRast::event)
     #define _AR_FLUSH(ctx, id)          ArchRast::FlushDraw(ctx, id)
 #else
-    #ifdef KNOB_ENABLE_RDTSC
-        #define _AR_BEGIN(ctx, type, id) (void)ctx; RDTSC_START(type)
-        #define _AR_END(ctx, type, id)   RDTSC_STOP(type, id, 0)
-    #else
-        #define _AR_BEGIN(ctx, type, id) (void)ctx
-        #define _AR_END(ctx, type, id)
-    #endif
     #define _AR_EVENT(ctx, event)
     #define _AR_FLUSH(ctx, id)
 #endif
 
 // Use these macros for api thread.
-#define AR_API_BEGIN(type, id) _AR_BEGIN(AR_API_CTX, type, id)
-#define AR_API_END(type, count) _AR_END(AR_API_CTX, type, count)
 #define AR_API_EVENT(event) _AR_EVENT(AR_API_CTX, event)
 
 // Use these macros for worker threads.
-#define AR_BEGIN(type, id) _AR_BEGIN(AR_WORKER_CTX, type, id)
-#define AR_END(type, count) _AR_END(AR_WORKER_CTX, type, count)
 #define AR_EVENT(event) _AR_EVENT(AR_WORKER_CTX, event)
 #define AR_FLUSH(id) _AR_FLUSH(AR_WORKER_CTX, id)
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index 66c4b74..a9b1372 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -150,7 +150,7 @@ void ProcessStoreTiles(
     uint32_t workerId,
     void *pUserData)
 {
-    AR_BEGIN(FEProcessStoreTiles, pDC->drawId);
+    RDTSC_BEGIN(FEProcessStoreTiles, pDC->drawId);
     MacroTileMgr *pTileMgr = pDC->pTileMgr;
     STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData;
 
@@ -175,7 +175,7 @@ void ProcessStoreTiles(
         }
     }
 
-    AR_END(FEProcessStoreTiles, 0);
+    RDTSC_END(FEProcessStoreTiles, 0);
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -191,7 +191,7 @@ void ProcessDiscardInvalidateTiles(
     uint32_t workerId,
     void *pUserData)
 {
-    AR_BEGIN(FEProcessInvalidateTiles, pDC->drawId);
+    RDTSC_BEGIN(FEProcessInvalidateTiles, pDC->drawId);
     DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
     MacroTileMgr *pTileMgr = pDC->pTileMgr;
 
@@ -230,7 +230,7 @@ void ProcessDiscardInvalidateTiles(
         }
     }
 
-    AR_END(FEProcessInvalidateTiles, 0);
+    RDTSC_END(FEProcessInvalidateTiles, 0);
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -507,7 +507,7 @@ static void StreamOut(
 {
     SWR_CONTEXT *pContext = pDC->pContext;
 
-    AR_BEGIN(FEStreamout, pDC->drawId);
+    RDTSC_BEGIN(FEStreamout, pDC->drawId);
 
     const API_STATE& state = GetApiState(pDC);
     const SWR_STREAMOUT_STATE &soState = state.soState;
@@ -582,7 +582,7 @@ static void StreamOut(
     UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
     UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);
 
-    AR_END(FEStreamout, 1);
+    RDTSC_END(FEStreamout, 1);
 }
 
 #if USE_SIMD16_FRONTEND
@@ -801,7 +801,7 @@ static void GeometryShaderStage(
 {
     SWR_CONTEXT *pContext = pDC->pContext;
 
-    AR_BEGIN(FEGeometryShader, pDC->drawId);
+    RDTSC_BEGIN(FEGeometryShader, pDC->drawId);
 
     const API_STATE& state = GetApiState(pDC);
     const SWR_GS_STATE* pState = &state.gsState;
@@ -1073,7 +1073,7 @@ static void GeometryShaderStage(
     UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
     UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
     AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim*numInputPrims));
-    AR_END(FEGeometryShader, 1);
+    RDTSC_END(FEGeometryShader, 1);
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -1253,9 +1253,9 @@ static void TessellationStages(
     hsContext.mask = GenerateMask(numPrims);
 
     // Run the HS
-    AR_BEGIN(FEHullShader, pDC->drawId);
+    RDTSC_BEGIN(FEHullShader, pDC->drawId);
     state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
-    AR_END(FEHullShader, 0);
+    RDTSC_END(FEHullShader, 0);
 
     UPDATE_STAT_FE(HsInvocations, numPrims);
 
@@ -1265,10 +1265,10 @@ static void TessellationStages(
     {
         // Run Tessellator
         SWR_TS_TESSELLATED_DATA tsData = { 0 };
-        AR_BEGIN(FETessellation, pDC->drawId);
+        RDTSC_BEGIN(FETessellation, pDC->drawId);
         TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
         AR_EVENT(TessPrimCount(1));
-        AR_END(FETessellation, 0);
+        RDTSC_END(FETessellation, 0);
 
         if (tsData.NumPrimitives == 0)
         {
@@ -1317,9 +1317,9 @@ static void TessellationStages(
         {
             dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);
 
-            AR_BEGIN(FEDomainShader, pDC->drawId);
+            RDTSC_BEGIN(FEDomainShader, pDC->drawId);
             state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
-            AR_END(FEDomainShader, 0);
+            RDTSC_END(FEDomainShader, 0);
 
             dsInvocations += KNOB_SIMD_WIDTH;
         }
@@ -1390,14 +1390,14 @@ static void TessellationStages(
 #else
                     simdvector      prim[3];        // Only deal with triangles, lines, or points
 #endif
-                    AR_BEGIN(FEPAAssemble, pDC->drawId);
+                    RDTSC_BEGIN(FEPAAssemble, pDC->drawId);
                     bool assemble =
 #if USE_SIMD16_FRONTEND
                         tessPa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
 #else
                         tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
 #endif
-                    AR_END(FEPAAssemble, 1);
+                    RDTSC_END(FEPAAssemble, 1);
                     SWR_ASSERT(assemble);
 
                     SWR_ASSERT(pfnClipFunc);
@@ -1520,7 +1520,7 @@ void ProcessDraw(
     }
 #endif
 
-    AR_BEGIN(FEProcessDraw, pDC->drawId);
+    RDTSC_BEGIN(FEProcessDraw, pDC->drawId);
 
     DRAW_WORK&          work = *(DRAW_WORK*)pUserData;
     const API_STATE&    state = GetApiState(pDC);
@@ -1725,7 +1725,7 @@ void ProcessDraw(
             if (i < endVertex)
             {
                 // 1. Execute FS/VS for a single SIMD.
-                AR_BEGIN(FEFetchShader, pDC->drawId);
+                RDTSC_BEGIN(FEFetchShader, pDC->drawId);
 #if USE_SIMD16_SHADERS
                 state.pfnFetchFunc(GetPrivateState(pDC), fetchInfo_lo, vin);
 #else
@@ -1736,7 +1736,7 @@ void ProcessDraw(
                     state.pfnFetchFunc(GetPrivateState(pDC), fetchInfo_hi, vin_hi);
                 }
 #endif
-                AR_END(FEFetchShader, 0);
+                RDTSC_END(FEFetchShader, 0);
 
                 // forward fetch generated vertex IDs to the vertex shader
 #if USE_SIMD16_SHADERS
@@ -1780,7 +1780,7 @@ void ProcessDraw(
                 if (!KNOB_TOSS_FETCH)
 #endif
                 {
-                    AR_BEGIN(FEVertexShader, pDC->drawId);
+                    RDTSC_BEGIN(FEVertexShader, pDC->drawId);
 #if USE_SIMD16_VS
                     state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_lo);
 #else
@@ -1791,7 +1791,7 @@ void ProcessDraw(
                         state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_hi);
                     }
 #endif
-                    AR_END(FEVertexShader, 0);
+                    RDTSC_END(FEVertexShader, 0);
 
                     UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                 }
@@ -1979,9 +1979,9 @@ void ProcessDraw(
             {
 
                 // 1. Execute FS/VS for a single SIMD.
-                AR_BEGIN(FEFetchShader, pDC->drawId);
+                RDTSC_BEGIN(FEFetchShader, pDC->drawId);
                 state.pfnFetchFunc(GetPrivateState(pDC), fetchInfo, vout);
-                AR_END(FEFetchShader, 0);
+                RDTSC_END(FEFetchShader, 0);
 
                 // forward fetch generated vertex IDs to the vertex shader
                 vsContext.VertexID = fetchInfo.VertexID;
@@ -2001,9 +2001,9 @@ void ProcessDraw(
                 if (!KNOB_TOSS_FETCH)
 #endif
                 {
-                    AR_BEGIN(FEVertexShader, pDC->drawId);
+                    RDTSC_BEGIN(FEVertexShader, pDC->drawId);
                     state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
-                    AR_END(FEVertexShader, 0);
+                    RDTSC_END(FEVertexShader, 0);
 
                     UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                 }
@@ -2014,9 +2014,9 @@ void ProcessDraw(
             {
                 simdvector prim[MAX_NUM_VERTS_PER_PRIM];
                 // PaAssemble returns false if there is not enough verts to assemble.
-                AR_BEGIN(FEPAAssemble, pDC->drawId);
+                RDTSC_BEGIN(FEPAAssemble, pDC->drawId);
                 bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
-                AR_END(FEPAAssemble, 1);
+                RDTSC_END(FEPAAssemble, 1);
 
 #if KNOB_ENABLE_TOSS_POINTS
                 if (!KNOB_TOSS_FETCH)
@@ -2104,7 +2104,7 @@ void ProcessDraw(
 
 #endif
 
-    AR_END(FEProcessDraw, numPrims * work.numInstances);
+    RDTSC_END(FEProcessDraw, numPrims * work.numInstances);
 }
 
 struct FEDrawChooser
diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
index ae1e9c7..6c5f17d 100644
--- a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
@@ -53,7 +53,7 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi
 #endif
 
     // bloat line to two tris and call the triangle rasterizer twice
-    AR_BEGIN(BERasterizeLine, pDC->drawId);
+    RDTSC_BEGIN(BERasterizeLine, pDC->drawId);
 
     const API_STATE &state = GetApiState(pDC);
     const SWR_RASTSTATE &rastState = state.rastState;
@@ -246,7 +246,7 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi
         pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
     }
 
-    AR_END(BERasterizeLine, 1);
+    RDTSC_END(BERasterizeLine, 1);
 }
 
 void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData)
@@ -308,9 +308,9 @@ void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTi
     GetRenderHotTiles(pDC, macroTile, tileAlignedX >> KNOB_TILE_X_DIM_SHIFT , tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT, 
         renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
 
-    AR_BEGIN(BEPixelBackend, pDC->drawId);
+    RDTSC_BEGIN(BEPixelBackend, pDC->drawId);
     backendFuncs.pfnBackend(pDC, workerId, tileAlignedX, tileAlignedY, triDesc, renderBuffers);
-    AR_END(BEPixelBackend, 0);
+    RDTSC_END(BEPixelBackend, 0);
 }
 
 void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData)
diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h b/src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h
index 081e4dd..6dba1b6 100644
--- a/src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h
+++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h
@@ -781,9 +781,9 @@ struct GenerateSVInnerCoverage<RT, AllEdgesValidT, InnerConservativeCoverageT>
         }
 
         // not trivial accept or reject, must rasterize full tile
-        AR_BEGIN(BERasterizePartial, pDC->drawId);
+        RDTSC_BEGIN(BERasterizePartial, pDC->drawId);
         innerCoverageMask = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdgesAdj, pRastEdges);
-        AR_END(BERasterizePartial, 0);
+        RDTSC_END(BERasterizePartial, 0);
     }
 };
 
@@ -847,8 +847,8 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
         return;
     }
 #endif
-    AR_BEGIN(BERasterizeTriangle, pDC->drawId);
-    AR_BEGIN(BETriangleSetup, pDC->drawId);
+    RDTSC_BEGIN(BERasterizeTriangle, pDC->drawId);
+    RDTSC_BEGIN(BETriangleSetup, pDC->drawId);
 
     const API_STATE &state = GetApiState(pDC);
     const SWR_RASTSTATE &rastState = state.rastState;
@@ -1014,7 +1014,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
 
     SWR_ASSERT(intersect.xmin <= intersect.xmax && intersect.ymin <= intersect.ymax && intersect.xmin >= 0 && intersect.xmax >= 0 && intersect.ymin >= 0 && intersect.ymax >= 0);
 
-    AR_END(BETriangleSetup, 0);
+    RDTSC_END(BETriangleSetup, 0);
 
     // update triangle desc
     uint32_t minTileX = intersect.xmin >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
@@ -1027,11 +1027,11 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
     if (numTilesX == 0 || numTilesY == 0) 
     {
         RDTSC_EVENT(BEEmptyTriangle, 1, 0);
-        AR_END(BERasterizeTriangle, 1);
+        RDTSC_END(BERasterizeTriangle, 1);
         return;
     }
 
-    AR_BEGIN(BEStepSetup, pDC->drawId);
+    RDTSC_BEGIN(BEStepSetup, pDC->drawId);
 
     // Step to pixel center of top-left pixel of the triangle bbox
     // Align intersect bbox (top/left) to raster tile's (top/left).
@@ -1140,7 +1140,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
         }
     }
 
-    AR_END(BEStepSetup, 0);
+    RDTSC_END(BEStepSetup, 0);
 
     uint32_t tY = minTileY;
     uint32_t tX = minTileX;
@@ -1233,9 +1233,9 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
                         }
 
                         // not trivial accept or reject, must rasterize full tile
-                        AR_BEGIN(BERasterizePartial, pDC->drawId);
+                        RDTSC_BEGIN(BERasterizePartial, pDC->drawId);
                         triDesc.coverageMask[sampleNum] = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdges, rastEdges);
-                        AR_END(BERasterizePartial, 0);
+                        RDTSC_END(BERasterizePartial, 0);
 
                         triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum]; 
                         
@@ -1271,9 +1271,9 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
                     UnrollerL<1, RT::MT::numSamples, 1>::step(copyCoverage);
                 }
 
-                AR_BEGIN(BEPixelBackend, pDC->drawId);
+                RDTSC_BEGIN(BEPixelBackend, pDC->drawId);
                 backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers);
-                AR_END(BEPixelBackend, 0);
+                RDTSC_END(BEPixelBackend, 0);
             }
 
             // step to the next tile in X
@@ -1292,7 +1292,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
         StepRasterTileY<RT>(state.colorHottileEnable, renderBuffers, currentRenderBufferRow);
     }
 
-    AR_END(BERasterizeTriangle, 1);
+    RDTSC_END(BERasterizeTriangle, 1);
 }
 
 // Get pointers to hot tile memory for color RT, depth, stencil
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index d684ffe..4d79168 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -541,7 +541,7 @@ bool WorkOnFifoBE(
             {
                 BE_WORK *pWork;
 
-                AR_BEGIN(WorkerFoundWork, pDC->drawId);
+                RDTSC_BEGIN(WorkerFoundWork, pDC->drawId);
 
                 uint32_t numWorkItems = tile->getNumQueued();
                 SWR_ASSERT(numWorkItems);
@@ -562,7 +562,7 @@ bool WorkOnFifoBE(
                     pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);
                     tile->dequeue();
                 }
-                AR_END(WorkerFoundWork, numWorkItems);
+                RDTSC_END(WorkerFoundWork, numWorkItems);
 
                 _ReadWriteBarrier();
 
@@ -849,9 +849,9 @@ DWORD workerThreadMain(LPVOID pData)
 
         if (IsBEThread)
         {
-            AR_BEGIN(WorkerWorkOnFifoBE, 0);
+            RDTSC_BEGIN(WorkerWorkOnFifoBE, 0);
             bShutdown |= WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
-            AR_END(WorkerWorkOnFifoBE, 0);
+            RDTSC_END(WorkerWorkOnFifoBE, 0);
 
             WorkOnCompute(pContext, workerId, curDrawBE);
         }
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
index 3ade6e4..f468670 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
@@ -396,19 +396,19 @@ void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, ui
 
         if (pHotTile->state == HOTTILE_INVALID)
         {
-            AR_BEGIN(BELoadTiles, pDC->drawId);
+            RDTSC_BEGIN(BELoadTiles, pDC->drawId);
             // invalid hottile before draw requires a load from surface before we can draw to it
             pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_COLOR_HOT_TILE_FORMAT, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
             pHotTile->state = HOTTILE_DIRTY;
-            AR_END(BELoadTiles, 0);
+            RDTSC_END(BELoadTiles, 0);
         }
         else if (pHotTile->state == HOTTILE_CLEAR)
         {
-            AR_BEGIN(BELoadTiles, pDC->drawId);
+            RDTSC_BEGIN(BELoadTiles, pDC->drawId);
             // Clear the tile.
             ClearColorHotTile(pHotTile);
             pHotTile->state = HOTTILE_DIRTY;
-            AR_END(BELoadTiles, 0);
+            RDTSC_END(BELoadTiles, 0);
         }
         colorHottileEnableMask &= ~(1 << rtSlot);
     }
@@ -419,19 +419,19 @@ void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, ui
         HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples);
         if (pHotTile->state == HOTTILE_INVALID)
         {
-            AR_BEGIN(BELoadTiles, pDC->drawId);
+            RDTSC_BEGIN(BELoadTiles, pDC->drawId);
             // invalid hottile before draw requires a load from surface before we can draw to it
             pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
             pHotTile->state = HOTTILE_DIRTY;
-            AR_END(BELoadTiles, 0);
+            RDTSC_END(BELoadTiles, 0);
         }
         else if (pHotTile->state == HOTTILE_CLEAR)
         {
-            AR_BEGIN(BELoadTiles, pDC->drawId);
+            RDTSC_BEGIN(BELoadTiles, pDC->drawId);
             // Clear the tile.
             ClearDepthHotTile(pHotTile);
             pHotTile->state = HOTTILE_DIRTY;
-            AR_END(BELoadTiles, 0);
+            RDTSC_END(BELoadTiles, 0);
         }
     }
 
@@ -441,19 +441,19 @@ void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, ui
         HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples);
         if (pHotTile->state == HOTTILE_INVALID)
         {
-            AR_BEGIN(BELoadTiles, pDC->drawId);
+            RDTSC_BEGIN(BELoadTiles, pDC->drawId);
             // invalid hottile before draw requires a load from surface before we can draw to it
             pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
             pHotTile->state = HOTTILE_DIRTY;
-            AR_END(BELoadTiles, 0);
+            RDTSC_END(BELoadTiles, 0);
         }
         else if (pHotTile->state == HOTTILE_CLEAR)
         {
-            AR_BEGIN(BELoadTiles, pDC->drawId);
+            RDTSC_BEGIN(BELoadTiles, pDC->drawId);
             // Clear the tile.
             ClearStencilHotTile(pHotTile);
             pHotTile->state = HOTTILE_DIRTY;
-            AR_END(BELoadTiles, 0);
+            RDTSC_END(BELoadTiles, 0);
         }
     }
 }
-- 
2.7.4



More information about the mesa-dev mailing list