Mesa (master): r600/sfn: extract class to handle the VS export to different stages

Tue Apr 28 08:17:40 UTC 2020

Module: Mesa
Branch: master
Commit: f7df2c57a207a386ba0d2130541ac9d0546670e1
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=f7df2c57a207a386ba0d2130541ac9d0546670e1

Author: Gert Wollny <gert.wollny at collabora.com>
Date:   Wed Apr 15 16:40:49 2020 +0200

r600/sfn: extract class to handle the VS export to different stages

This code can be shared with the TESS_EVAL shader

Signed-off-by: Gert Wollny <gert.wollny at collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4714>

---

 src/gallium/drivers/r600/Makefile.sources          |   4 +-
 src/gallium/drivers/r600/meson.build               |   2 +
 src/gallium/drivers/r600/sfn/sfn_nir.cpp           |  12 +-
 src/gallium/drivers/r600/sfn/sfn_shader_base.h     |   3 +-
 src/gallium/drivers/r600/sfn/sfn_shader_vertex.cpp | 469 +++------------------
 src/gallium/drivers/r600/sfn/sfn_shader_vertex.h   |  53 +--
 .../drivers/r600/sfn/sfn_vertexstageexport.cpp     | 456 ++++++++++++++++++++
 .../drivers/r600/sfn/sfn_vertexstageexport.h       |  85 ++++
 8 files changed, 617 insertions(+), 467 deletions(-)

diff --git a/src/gallium/drivers/r600/Makefile.sources b/src/gallium/drivers/r600/Makefile.sources
index 673b8a0ac4d..45342e4ad21 100644
--- a/src/gallium/drivers/r600/Makefile.sources
+++ b/src/gallium/drivers/r600/Makefile.sources
@@ -152,7 +152,9 @@ CXX_SOURCES = \
 	sfn/sfn_value_gpr.cpp \
 	sfn/sfn_value_gpr.h \
 	sfn/sfn_valuepool.cpp \
-	sfn/sfn_valuepool.h
+	sfn/sfn_valuepool.h \
+	sfn/sfn_vertexstageexport.cpp \
+        sfn/sfn_vertexstageexport.h
 
 R600_GENERATED_FILES = \
 	egd_tables.h
diff --git a/src/gallium/drivers/r600/meson.build b/src/gallium/drivers/r600/meson.build
index fe369a99dab..468a8165dd5 100644
--- a/src/gallium/drivers/r600/meson.build
+++ b/src/gallium/drivers/r600/meson.build
@@ -170,6 +170,8 @@ files_r600 = files(
   'sfn/sfn_value_gpr.h',
   'sfn/sfn_valuepool.cpp',
   'sfn/sfn_valuepool.h',
+  'sfn/sfn_vertexstageexport.cpp',
+  'sfn/sfn_vertexstageexport.h',
   )
 
 egd_tables_h = custom_target(
diff --git a/src/gallium/drivers/r600/sfn/sfn_nir.cpp b/src/gallium/drivers/r600/sfn/sfn_nir.cpp
index 36ea68753e9..e9a598703f3 100644
--- a/src/gallium/drivers/r600/sfn/sfn_nir.cpp
+++ b/src/gallium/drivers/r600/sfn/sfn_nir.cpp
@@ -60,16 +60,8 @@ bool ShaderFromNir::lower(const nir_shader *shader, r600_pipe_shader *pipe_shade
 
    switch (shader->info.stage) {
    case MESA_SHADER_VERTEX:
-      if (key.vs.as_es) {
-         sfn_log << SfnLog::trans << "Start VS for GS\n";
-         impl.reset(new VertexShaderFromNirForGS(pipe_shader, *sel, key, gs_shader));
-      } else if (key.vs.as_ls) {
-         sfn_log << "VS: next type TCS and TES not yet supported\n";
-         return false;
-      } else {
-         sfn_log << SfnLog::trans << "Start VS for FS\n";
-         impl.reset(new VertexShaderFromNirForFS(pipe_shader, *sel, key));
-      }
+      impl.reset(new VertexShaderFromNir(pipe_shader, *sel, key, gs_shader));
+      break;
       break;
    case MESA_SHADER_GEOMETRY:
       sfn_log << SfnLog::trans << "Start GS\n";
diff --git a/src/gallium/drivers/r600/sfn/sfn_shader_base.h b/src/gallium/drivers/r600/sfn/sfn_shader_base.h
index c747cc6db9a..70062db6258 100644
--- a/src/gallium/drivers/r600/sfn/sfn_shader_base.h
+++ b/src/gallium/drivers/r600/sfn/sfn_shader_base.h
@@ -105,11 +105,10 @@ protected:
    bool emit_store_local_shared(nir_intrinsic_instr* instr);
 
    bool emit_barrier(nir_intrinsic_instr* instr);
-   const GPRVector *output_register(unsigned location) const;
 
    bool load_preloaded_value(const nir_dest& dest, int chan, PValue value,
                              bool as_last = true);
-   void add_param_output_reg(int loc, const GPRVector *gpr);
+
    void inc_atomic_file_count();
    std::bitset<8> m_sv_values;
 
diff --git a/src/gallium/drivers/r600/sfn/sfn_shader_vertex.cpp b/src/gallium/drivers/r600/sfn/sfn_shader_vertex.cpp
index 67eb357a134..ba8a583d780 100644
--- a/src/gallium/drivers/r600/sfn/sfn_shader_vertex.cpp
+++ b/src/gallium/drivers/r600/sfn/sfn_shader_vertex.cpp
@@ -38,17 +38,16 @@ using std::priority_queue;
 
 VertexShaderFromNir::VertexShaderFromNir(r600_pipe_shader *sh,
                                          r600_pipe_shader_selector& sel,
-                                         const r600_shader_key& key):
-   ShaderFromNirProcessor (PIPE_SHADER_VERTEX, sel, sh->shader,
-                           sh->scratch_space_needed),
+                                         const r600_shader_key& key,
+                                         struct r600_shader* gs_shader):
+   VertexStage(PIPE_SHADER_VERTEX, sel, sh->shader,
+               sh->scratch_space_needed),
    m_num_clip_dist(0),
    m_last_param_export(nullptr),
    m_last_pos_export(nullptr),
    m_pipe_shader(sh),
    m_enabled_stream_buffers_mask(0),
    m_so_info(&sel.so),
-   m_cur_param(0),
-   m_cur_clip_pos(1),
    m_vertex_id(),
    m_key(key)
 {
@@ -56,6 +55,18 @@ VertexShaderFromNir::VertexShaderFromNir(r600_pipe_shader *sh,
    increment_reserved_registers();
 
    sh_info().atomic_base = key.vs.first_atomic_counter;
+   sh_info().vs_as_gs_a = m_key.vs.as_gs_a;
+
+   if (key.vs.as_es) { /* select the export handler from the shader key */
+      sh->shader.vs_as_es = true;
+      m_export_processor.reset(new VertexStageExportForGS(*this, gs_shader));
+   } else if (key.vs.as_ls) {
+      sh->shader.vs_as_ls = true;
+      sfn_log << SfnLog::trans << "Start VS for TCS\n"; /* as_ls is the TCS/TES path, not GS */
+      m_export_processor.reset(new VertexStageExportForES(*this));
+   } else {
+      m_export_processor.reset(new VertexStageExportForFS(*this, &sel.so, sh, key));
+   }
 }
 
 bool VertexShaderFromNir::do_process_inputs(nir_variable *input)
@@ -80,6 +91,13 @@ bool VertexShaderFromNir::allocate_reserved_registers()
    m_vertex_id.reset(R0x);
    inject_register(0, 0, m_vertex_id, false);
 
+   if (m_key.vs.as_gs_a || m_sv_values.test(es_primitive_id)) {
+      auto R0z = new GPRValue(0,2); /* register 0, channel 2 holds the primitive id */
+      R0z->set_as_input(); /* flag the new R0.z value itself as input, not R0.x */
+      m_primitive_id.reset(R0z);
+      inject_register(0, 2, m_primitive_id, false);
+   }
+
    if (m_sv_values.test(es_instanceid)) {
       auto R0w = new GPRValue(0,3);
       R0w->set_as_input();
@@ -87,20 +105,22 @@ bool VertexShaderFromNir::allocate_reserved_registers()
       inject_register(0, 3, m_instance_id, false);
    }
 
-   priority_queue<int, std::vector<int>, std::greater<int>>  q;
-   for (auto a: m_param_map) {
-      q.push(a.first);
-   }
 
-   int next_param = 0;
-   while (!q.empty()) {
-      int loc = q.top();
-      q.pop();
-      m_param_map[loc] = next_param++;
+   if (m_sv_values.test(es_rel_patch_id)) {
+      auto R0y = new GPRValue(0,1);
+      R0y->set_as_input();
+      m_rel_vertex_id.reset(R0y);
+      inject_register(0, 1, m_rel_vertex_id, false);
    }
+
    return true;
 }
 
+void VertexShaderFromNir::emit_shader_start()
+{
+   m_export_processor->setup_paramn_map();
+}
+
 bool VertexShaderFromNir::scan_sysvalue_access(nir_instr *instr)
 {
    switch (instr->type) {
@@ -135,48 +155,28 @@ bool VertexShaderFromNir::emit_intrinsic_instruction_override(nir_intrinsic_inst
    }
 }
 
-bool VertexShaderFromNir::do_process_outputs(nir_variable *output)
+bool VertexShaderFromNir::emit_store_local_shared(nir_intrinsic_instr* instr)
 {
-   if (output->data.location == VARYING_SLOT_COL0 ||
-       output->data.location == VARYING_SLOT_COL1 ||
-       (output->data.location >= VARYING_SLOT_VAR0 &&
-       output->data.location <= VARYING_SLOT_VAR31) ||
-       (output->data.location >= VARYING_SLOT_TEX0 &&
-        output->data.location <= VARYING_SLOT_TEX7) ||
-       output->data.location == VARYING_SLOT_BFC0 ||
-       output->data.location == VARYING_SLOT_BFC1 ||
-       output->data.location == VARYING_SLOT_CLIP_VERTEX ||
-       output->data.location == VARYING_SLOT_CLIP_DIST0 ||
-       output->data.location == VARYING_SLOT_CLIP_DIST1 ||
-       output->data.location == VARYING_SLOT_POS ||
-       output->data.location == VARYING_SLOT_PSIZ ||
-       output->data.location == VARYING_SLOT_FOGC ||
-       output->data.location == VARYING_SLOT_LAYER ||
-       output->data.location == VARYING_SLOT_EDGE ||
-       output->data.location == VARYING_SLOT_VIEWPORT
-       ) {
+   unsigned write_mask = nir_intrinsic_write_mask(instr);
 
-      r600_shader_io& io = sh_info().output[output->data.driver_location];
-      tgsi_get_gl_varying_semantic(static_cast<gl_varying_slot>( output->data.location),
-                                   true, &io.name, &io.sid);
-      if (! m_key.vs.as_es)
-         evaluate_spi_sid(io);
-      ++sh_info().noutput;
+   auto address = from_nir(instr->src[1], 0);
+   int swizzle_base = (write_mask & 0x3) ? 0 : 2;
+   write_mask |= write_mask >> 2;
 
-      if (output->data.location == VARYING_SLOT_PSIZ ||
-          output->data.location == VARYING_SLOT_EDGE ||
-          output->data.location == VARYING_SLOT_LAYER)
-            m_cur_clip_pos = 2;
+   auto value =  from_nir(instr->src[0], swizzle_base);
+   if (!(write_mask & 2)) {
+      emit_instruction(new LDSWriteInstruction(address, 1, value));
+   } else {
+      auto value1 =  from_nir(instr->src[0], swizzle_base + 1);
+      emit_instruction(new LDSWriteInstruction(address, 1, value, value1));
+   }
 
-      if (output->data.location != VARYING_SLOT_POS &&
-          output->data.location != VARYING_SLOT_EDGE &&
-          output->data.location != VARYING_SLOT_PSIZ &&
-          output->data.location != VARYING_SLOT_CLIP_VERTEX)
-         m_param_map[output->data.location] = m_cur_param++;
+   return true;
+}
 
-      return true;
-   }
-   return false;
+bool VertexShaderFromNir::do_process_outputs(nir_variable *output)
+{
+   return m_export_processor->do_process_outputs(output);
 }
 
 bool VertexShaderFromNir::do_emit_load_deref(const nir_variable *in_var, nir_intrinsic_instr* instr)
@@ -199,375 +199,14 @@ bool VertexShaderFromNir::do_emit_load_deref(const nir_variable *in_var, nir_int
    return false;
 }
 
-bool VertexShaderFromNir::emit_clip_vertices(const nir_variable *out_var, nir_intrinsic_instr* instr)
-{
-   sh_info().cc_dist_mask = 0xff;
-   sh_info().clip_dist_write = 0xff;
-
-   std::unique_ptr<GPRVector> clip_vertex(vec_from_nir_with_fetch_constant(instr->src[1], 0xf, {0,1,2,3}));
-
-   for (int i = 0; i < 4; ++i)
-      sh_info().output[out_var->data.driver_location].write_mask |= 1 << i;
-
-   GPRVector clip_dist[2] = { get_temp_vec4(), get_temp_vec4()};
-
-   for (int i = 0; i < 8; i++) {
-      int oreg = i >> 2;
-      int ochan = i & 3;
-      AluInstruction *ir = nullptr;
-      for (int j = 0; j < 4; j++) {
-         ir = new AluInstruction(op2_dot4_ieee, clip_dist[oreg].reg_i(j), clip_vertex->reg_i(j),
-                                 PValue(new UniformValue(512 + i, j, R600_BUFFER_INFO_CONST_BUFFER)),
-                                 (j == ochan) ? EmitInstruction::write : EmitInstruction::empty);
-         emit_instruction(ir);
-      }
-      ir->set_flag(alu_last_instr);
-   }
-
-   m_last_pos_export = new ExportInstruction(m_cur_clip_pos++, clip_dist[0], ExportInstruction::et_pos);
-   emit_export_instruction(m_last_pos_export);
-
-   m_last_pos_export = new ExportInstruction(m_cur_clip_pos, clip_dist[1], ExportInstruction::et_pos);
-   emit_export_instruction(m_last_pos_export);
-
-   return true;
-}
-
-bool VertexShaderFromNir::emit_varying_pos(const nir_variable *out_var, nir_intrinsic_instr* instr,
-                                           std::array<uint32_t, 4> *swizzle_override)
-{
-   std::array<uint32_t,4> swizzle;
-   uint32_t write_mask = 0;
-
-   if (swizzle_override) {
-      swizzle = *swizzle_override;
-      for (int i = 0; i < 4; ++i) {
-         if (swizzle[i] < 6)
-            write_mask |= 1 << i;
-      }
-   } else {
-      write_mask = nir_intrinsic_write_mask(instr) << out_var->data.location_frac;
-      for (int i = 0; i < 4; ++i)
-         swizzle[i] = ((1 << i) & write_mask) ? i - out_var->data.location_frac : 7;
-   }
-
-   sh_info().output[out_var->data.driver_location].write_mask = write_mask;
-
-   GPRVector *value = vec_from_nir_with_fetch_constant(instr->src[1], write_mask, swizzle);
-   set_output(out_var->data.driver_location, PValue(value));
-
-   int export_slot = 0;
-
-   switch (out_var->data.location) {
-   case VARYING_SLOT_EDGE: {
-      sh_info().vs_out_misc_write = 1;
-      sh_info().vs_out_edgeflag = 1;
-      emit_instruction(op1_mov, value->reg_i(1), {value->reg_i(1)}, {alu_write, alu_dst_clamp, alu_last_instr});
-      emit_instruction(op1_flt_to_int, value->reg_i(1), {value->reg_i(1)}, {alu_write, alu_last_instr});
-      sh_info().output[out_var->data.driver_location].write_mask = 0xf;
-   }
-      /* fallthrough */
-   case VARYING_SLOT_PSIZ:
-   case VARYING_SLOT_LAYER:
-      export_slot = 1;
-      break;
-   case VARYING_SLOT_POS:
-      break;
-   case VARYING_SLOT_CLIP_DIST0:
-   case VARYING_SLOT_CLIP_DIST1:
-      export_slot = m_cur_clip_pos++;
-      break;
-   default:
-      sfn_log << SfnLog::err << __func__ << "Unsupported location "
-              << out_var->data.location << "\n";
-      return false;
-   }
-
-   m_last_pos_export = new ExportInstruction(export_slot, *value, ExportInstruction::et_pos);
-   emit_export_instruction(m_last_pos_export);
-   add_param_output_reg(out_var->data.driver_location, m_last_pos_export->gpr_ptr());
-   return true;
-}
-
-bool VertexShaderFromNir::emit_varying_param(const nir_variable *out_var, nir_intrinsic_instr* instr)
-{
-   assert(out_var->data.driver_location < sh_info().noutput);
-   sfn_log << SfnLog::io << __func__ << ": emit DDL: " << out_var->data.driver_location << "\n";
-
-   int write_mask = nir_intrinsic_write_mask(instr) << out_var->data.location_frac;
-   std::array<uint32_t,4> swizzle;
-   for (int i = 0; i < 4; ++i)
-      swizzle[i] = ((1 << i) & write_mask) ? i - out_var->data.location_frac : 7;
-
-   sh_info().output[out_var->data.driver_location].write_mask = write_mask;
-
-   GPRVector *value = vec_from_nir_with_fetch_constant(instr->src[1], write_mask, swizzle);
-   sh_info().output[out_var->data.driver_location].gpr = value->sel();
-
-   /* This should use the registers!! */
-   set_output(out_var->data.driver_location, PValue(value));
-
-   auto param_loc = m_param_map.find(out_var->data.location);
-   assert(param_loc != m_param_map.end());
-
-   m_last_param_export = new ExportInstruction(param_loc->second, *value, ExportInstruction::et_param);
-   emit_export_instruction(m_last_param_export);
-   add_param_output_reg(out_var->data.driver_location, m_last_param_export->gpr_ptr());
-   return true;
-}
-
-bool VertexShaderFromNir::emit_stream(int stream)
-{
-   assert(m_so_info);
-   if (m_so_info->num_outputs > PIPE_MAX_SO_OUTPUTS) {
-           R600_ERR("Too many stream outputs: %d\n", m_so_info->num_outputs);
-           return false;
-   }
-   for (unsigned i = 0; i < m_so_info->num_outputs; i++) {
-           if (m_so_info->output[i].output_buffer >= 4) {
-                   R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
-                            m_so_info->output[i].output_buffer);
-                   return false;
-           }
-   }
-   const GPRVector *so_gpr[PIPE_MAX_SHADER_OUTPUTS];
-   unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS];
-   std::vector<GPRVector> tmp(m_so_info->num_outputs);
-
-   /* Initialize locations where the outputs are stored. */
-   for (unsigned i = 0; i < m_so_info->num_outputs; i++) {
-      if (stream != -1 && stream != m_so_info->output[i].stream)
-         continue;
-
-      sfn_log << SfnLog::instr << "Emit stream " << i
-              << " with register index " << m_so_info->output[i].register_index << "  so_gpr:";
-
-
-      so_gpr[i] = output_register(m_so_info->output[i].register_index);
-
-      if (!so_gpr[i]) {
-         sfn_log << SfnLog::err << "\nERR: register index "
-                 << m_so_info->output[i].register_index
-                 << " doesn't correspond to an output register\n";
-         return false;
-      }
-      start_comp[i] = m_so_info->output[i].start_component;
-      /* Lower outputs with dst_offset < start_component.
-       *
-       * We can only output 4D vectors with a write mask, e.g. we can
-       * only output the W component at offset 3, etc. If we want
-       * to store Y, Z, or W at buffer offset 0, we need to use MOV
-       * to move it to X and output X. */
-      if (m_so_info->output[i].dst_offset < m_so_info->output[i].start_component) {
-         int tmp_index = allocate_temp_register();
-         int sc = m_so_info->output[i].start_component;
-         AluInstruction *alu = nullptr;
-         for (int j = 0; j < m_so_info->output[i].num_components; j++) {
-            PValue dst(new GPRValue(tmp_index, j));
-            alu = new AluInstruction(op1_mov, dst, so_gpr[i]->reg_i(j + sc), {alu_write});
-            tmp[i].set_reg_i(j, dst);
-            emit_instruction(alu);
-         }
-         if (alu)
-            alu->set_flag(alu_last_instr);
-
-         /* Fill the vector with masked values */
-         PValue dst_blank(new GPRValue(tmp_index, 7));
-         for (int j = m_so_info->output[i].num_components; j < 4; j++)
-            tmp[i].set_reg_i(j, dst_blank);
-
-         start_comp[i] = 0;
-         so_gpr[i] = &tmp[i];
-      }
-      sfn_log << SfnLog::instr <<  *so_gpr[i] << "\n";
-   }
-
-   /* Write outputs to buffers. */
-   for (unsigned i = 0; i < m_so_info->num_outputs; i++) {
-      sfn_log << SfnLog::instr << "Write output buffer " << i
-              << " with register index " << m_so_info->output[i].register_index << "\n";
-
-      StreamOutIntruction *out_stream =
-            new StreamOutIntruction(*so_gpr[i],
-                                    m_so_info->output[i].num_components,
-                                    m_so_info->output[i].dst_offset - start_comp[i],
-                                    ((1 << m_so_info->output[i].num_components) - 1) << start_comp[i],
-                                    m_so_info->output[i].output_buffer,
-                                    m_so_info->output[i].stream);
-      emit_export_instruction(out_stream);
-      m_enabled_stream_buffers_mask |= (1 << m_so_info->output[i].output_buffer) << m_so_info->output[i].stream * 4;
-   }
-   return true;
-}
-
 void VertexShaderFromNir::do_finalize()
 {
-   if (m_key.vs.as_gs_a) {
-      PValue o(new GPRValue(0,PIPE_SWIZZLE_0));
-      GPRVector primid({PValue(new GPRValue(0,2)), o,o,o});
-      m_last_param_export = new ExportInstruction(m_cur_param, primid, ExportInstruction::et_param);
-      emit_export_instruction(m_last_param_export);
-      int i;
-      i = sh_info().noutput++;
-      auto& io = sh_info().output[i];
-      io.name = TGSI_SEMANTIC_PRIMID;
-      io.sid = 0;
-      io.gpr = 0;
-      io.interpolate = TGSI_INTERPOLATE_CONSTANT;
-      io.write_mask = 0x4;
-      io.spi_sid = m_key.vs.prim_id_out;
-      sh_info().vs_as_gs_a = 1;
-   }
-
-   finalize_exports();
-}
-
-
-bool VertexShaderFromNirForFS::do_emit_store_deref(const nir_variable *out_var, nir_intrinsic_instr* instr)
-{
-
-   switch (out_var->data.location) {
-   case VARYING_SLOT_PSIZ:
-      sh_info().vs_out_point_size = 1;
-      sh_info().vs_out_misc_write = 1;
-      /* fallthrough */
-   case VARYING_SLOT_POS:
-      return emit_varying_pos(out_var, instr);
-   case VARYING_SLOT_EDGE: {
-      std::array<uint32_t, 4> swizzle_override = {7 ,0, 7, 7};
-      return emit_varying_pos(out_var, instr, &swizzle_override);
-   }
-   case VARYING_SLOT_CLIP_VERTEX:
-      return emit_clip_vertices(out_var, instr);
-   case VARYING_SLOT_CLIP_DIST0:
-   case VARYING_SLOT_CLIP_DIST1:
-      m_num_clip_dist += 4;
-      return emit_varying_param(out_var, instr) && emit_varying_pos(out_var, instr);
-   case VARYING_SLOT_LAYER: {
-      sh_info().vs_out_misc_write = 1;
-      sh_info().vs_out_layer = 1;
-      std::array<uint32_t, 4> swz = {7,7,0,7};
-      return emit_varying_pos(out_var, instr, &swz) &&
-            emit_varying_param(out_var, instr);
-   }
-   case VARYING_SLOT_VIEW_INDEX:
-      return emit_varying_pos(out_var, instr) &&
-            emit_varying_param(out_var, instr);
-
-   default:
-      if (out_var->data.location <= VARYING_SLOT_VAR31 ||
-          (out_var->data.location >= VARYING_SLOT_TEX0 &&
-           out_var->data.location <= VARYING_SLOT_TEX7))
-         return emit_varying_param(out_var, instr);
-   }
-
-   fprintf(stderr, "r600-NIR: Unimplemented store_deref for %d\n",
-           out_var->data.location);
-   return false;
-}
-
-void VertexShaderFromNirForFS::finalize_exports()
-{
-   if (m_so_info && m_so_info->num_outputs)
-      emit_stream(-1);
-
-   m_pipe_shader->enabled_stream_buffers_mask = m_enabled_stream_buffers_mask;
-
-   if (!m_last_param_export) {
-      GPRVector value(0,{7,7,7,7});
-      m_last_param_export = new ExportInstruction(0, value, ExportInstruction::et_param);
-      emit_export_instruction(m_last_param_export);
-   }
-   m_last_param_export->set_last();
-
-   if (!m_last_pos_export) {
-      GPRVector value(0,{7,7,7,7});
-      m_last_pos_export = new ExportInstruction(0, value, ExportInstruction::et_pos);
-      emit_export_instruction(m_last_pos_export);
-   }
-   m_last_pos_export->set_last();
-
-}
-
-VertexShaderFromNirForGS::VertexShaderFromNirForGS(r600_pipe_shader *sh,
-                                                   r600_pipe_shader_selector& sel,
-                                                   const r600_shader_key &key,
-                                                   const r600_shader *gs_shader):
-   VertexShaderFromNir(sh, sel, key),
-   m_gs_shader(gs_shader)
-{
-   sh->shader.vs_as_es = true;
-}
-
-bool VertexShaderFromNirForGS::do_emit_store_deref(const nir_variable *out_var, nir_intrinsic_instr* instr)
-{
-
-   assert(m_gs_shader);
-
-   int ring_offset = -1;
-   const r600_shader_io& out_io = sh_info().output[out_var->data.driver_location];
-
-   sfn_log << SfnLog::io << "check output " << out_var->data.driver_location
-           << " name=" << out_io.name<< " sid=" << out_io.sid << "\n";
-   for (unsigned k = 0; k < m_gs_shader->ninput; ++k) {
-      auto& in_io = m_gs_shader->input[k];
-      sfn_log << SfnLog::io << "  against  " <<  k << " name=" << in_io.name<< " sid=" << in_io.sid << "\n";
-
-      if (in_io.name == out_io.name &&
-          in_io.sid == out_io.sid) {
-         ring_offset = in_io.ring_offset;
-         break;
-      }
-   }
-
-   if (out_var->data.location == VARYING_SLOT_VIEWPORT)
-      return true;
-
-   if (ring_offset == -1) {
-      sfn_log << SfnLog::err << "VS defines output at "
-              << out_var->data.driver_location << "name=" << out_io.name
-              << " sid=" << out_io.sid << " that is not consumed as GS input\n";
-      return true;
-   }
-
-   uint32_t write_mask =  (1 << instr->num_components) - 1;
-
-   std::unique_ptr<GPRVector> value(vec_from_nir_with_fetch_constant(instr->src[1], write_mask,
-                                    swizzle_from_mask(instr->num_components)));
-
-   auto ir = new MemRingOutIntruction(cf_mem_ring, mem_write, *value,
-                                      ring_offset >> 2, 4, PValue());
-   emit_export_instruction(ir);
-
-   sh_info().output[out_var->data.driver_location].write_mask |= write_mask;
-   if (out_var->data.location == VARYING_SLOT_CLIP_DIST0 ||
-       out_var->data.location == VARYING_SLOT_CLIP_DIST1)
-      m_num_clip_dist += 4;
-
-   return true;
-}
-
-void VertexShaderFromNirForGS::finalize_exports()
-{
-}
-
-
-VertexShaderFromNirForES::VertexShaderFromNirForES(r600_pipe_shader *sh,
-                                                   UNUSED const pipe_stream_output_info *so_info,
-                                                   r600_pipe_shader_selector&  sel,
-                                                   const r600_shader_key &key):
-   VertexShaderFromNir(sh, sel, key)
-{
-}
-
-bool VertexShaderFromNirForES::do_emit_store_deref(UNUSED const nir_variable *out_var,
-                                                   UNUSED nir_intrinsic_instr* instr)
-{
-   return false;
+   m_export_processor->finalize_exports();
 }
 
-void VertexShaderFromNirForES::finalize_exports()
+bool VertexShaderFromNir::do_emit_store_deref(const nir_variable *out_var, nir_intrinsic_instr* instr)
 {
+   return m_export_processor->store_deref(out_var, instr);
 }
 
 }
diff --git a/src/gallium/drivers/r600/sfn/sfn_shader_vertex.h b/src/gallium/drivers/r600/sfn/sfn_shader_vertex.h
index fe8e6b1cbd5..f1c38452374 100644
--- a/src/gallium/drivers/r600/sfn/sfn_shader_vertex.h
+++ b/src/gallium/drivers/r600/sfn/sfn_shader_vertex.h
@@ -28,23 +28,21 @@
 #define sfn_vertex_shader_from_nir_h
 
 #include "sfn_shader_base.h"
+#include "sfn_vertexstageexport.h"
 
 namespace r600 {
 
-class VertexShaderFromNir : public ShaderFromNirProcessor {
+class VertexShaderFromNir : public VertexStage {
 public:
    VertexShaderFromNir(r600_pipe_shader *sh,
                        r600_pipe_shader_selector &sel,
-                       const r600_shader_key &key);
+                       const r600_shader_key &key, r600_shader *gs_shader);
 
    bool do_emit_load_deref(const nir_variable *in_var, nir_intrinsic_instr* instr) override;
    bool scan_sysvalue_access(nir_instr *instr) override;
+
+   PValue primitive_id() override {return m_primitive_id;}
 protected:
-   bool emit_varying_pos(const nir_variable *out_var, nir_intrinsic_instr* instr,
-                         std::array<uint32_t, 4> *swizzle_override = nullptr);
-   bool emit_varying_param(const nir_variable *out_var, nir_intrinsic_instr* instr);
-   bool emit_clip_vertices(const nir_variable *out_var, nir_intrinsic_instr* instr);
-   bool emit_stream(int stream);
 
    // todo: encapsulate
    unsigned m_num_clip_dist;
@@ -54,49 +52,26 @@ protected:
    unsigned m_enabled_stream_buffers_mask;
    const pipe_stream_output_info *m_so_info;
    void do_finalize() override;
+
+   std::map<unsigned, unsigned> m_param_map;
 private:
+   bool do_emit_store_deref(const nir_variable *out_var, nir_intrinsic_instr* instr) override;
+   void finalize_exports();
 
+   void emit_shader_start() override;
    bool do_process_inputs(nir_variable *input) override;
    bool allocate_reserved_registers() override;
    bool do_process_outputs(nir_variable *output) override;
    bool emit_intrinsic_instruction_override(nir_intrinsic_instr* instr) override;
-
-   virtual void finalize_exports() = 0;
-
-   unsigned m_cur_param;
-   std::map<unsigned, unsigned> m_param_map;
-   unsigned m_cur_clip_pos;
+   bool emit_store_local_shared(nir_intrinsic_instr* instr);
 
    PValue m_vertex_id;
    PValue m_instance_id;
+   PValue m_rel_vertex_id;
+   PValue m_primitive_id;
    r600_shader_key m_key;
-};
 
-class VertexShaderFromNirForFS :  public VertexShaderFromNir {
-public:
-   using VertexShaderFromNir::VertexShaderFromNir;
-
-   bool do_emit_store_deref(const nir_variable *out_var, nir_intrinsic_instr* instr) override;
-private:
-   void finalize_exports() override;
-};
-
-class VertexShaderFromNirForGS :  public VertexShaderFromNir {
-public:
-   VertexShaderFromNirForGS(r600_pipe_shader *sh, r600_pipe_shader_selector &sel,
-                            const r600_shader_key &key, const r600_shader *gs_shader);
-   bool do_emit_store_deref(const nir_variable *out_var, nir_intrinsic_instr* instr) override;
-   void finalize_exports() override;
-
-   const r600_shader *m_gs_shader;
-};
-
-class VertexShaderFromNirForES :  public VertexShaderFromNir {
-public:
-   VertexShaderFromNirForES(r600_pipe_shader *sh, const pipe_stream_output_info *so_info, r600_pipe_shader_selector &sel,
-                            const r600_shader_key &key);
-   bool do_emit_store_deref(const nir_variable *out_var, nir_intrinsic_instr* instr) override;
-   void finalize_exports() override;
+   std::unique_ptr<VertexStageExportBase> m_export_processor;
 };
 
 }
diff --git a/src/gallium/drivers/r600/sfn/sfn_vertexstageexport.cpp b/src/gallium/drivers/r600/sfn/sfn_vertexstageexport.cpp
new file mode 100644
index 00000000000..08c778a2e10
--- /dev/null
+++ b/src/gallium/drivers/r600/sfn/sfn_vertexstageexport.cpp
@@ -0,0 +1,456 @@
+#include "sfn_vertexstageexport.h"
+
+#include "tgsi/tgsi_from_mesa.h"
+
+namespace r600 {
+
+using std::priority_queue;
+
+/* Bind the exporter to the vertex-stage processor it emits into.
+ * The clip position counter starts at 1 and the parameter counter at 0. */
+VertexStageExportBase::VertexStageExportBase(VertexStage& proc):
+   m_proc(proc), m_cur_clip_pos(1), m_cur_param(0)
+{
+}
+
+/* Out-of-line virtual destructor; the base class has nothing to release. */
+VertexStageExportBase::~VertexStageExportBase() = default;
+
+/* Set up the FS exporter with no exports emitted yet and an empty
+ * stream-buffer mask. so_info and pipe_shader are kept as plain pointers and
+ * key as a reference, so all three must outlive this object. */
+VertexStageExportForFS::VertexStageExportForFS(VertexStage& proc,
+                                               const pipe_stream_output_info *so_info,
+                                               r600_pipe_shader *pipe_shader,
+                                               const r600_shader_key &key):
+   VertexStageExportBase(proc),
+   m_last_param_export(nullptr), m_last_pos_export(nullptr),
+   m_num_clip_dist(0), m_enabled_stream_buffers_mask(0),
+   m_so_info(so_info), m_pipe_shader(pipe_shader), m_key(key)
+{
+}
+
+/* Renumber the parameter slots so that they are consecutive, starting at 0,
+ * in ascending order of the output location used as map key.
+ *
+ * m_param_map is a std::map, which already iterates its keys in ascending
+ * order, so a single in-order walk is enough; no separate sort is needed. */
+void VertexStageExportBase::setup_paramn_map()
+{
+   unsigned next_param = 0;
+   for (auto& entry : m_param_map)
+      entry.second = next_param++;
+}
+
+/* Register one output variable in the shader info.
+ *
+ * For a supported location this fills the output slot selected by the
+ * variable's driver_location (semantic name/sid via
+ * tgsi_get_gl_varying_semantic, SPI sid via the processor, write mask from the
+ * component count shifted by location_frac) and bumps noutput. Every location
+ * except POS, EDGE, PSIZ and CLIP_VERTEX also gets the next parameter index.
+ *
+ * Returns false, without touching any state, for unsupported locations. */
+bool VertexStageExportBase::do_process_outputs(nir_variable *output)
+{
+   const auto loc = output->data.location;
+
+   bool supported = false;
+   switch (loc) {
+   case VARYING_SLOT_COL0:
+   case VARYING_SLOT_COL1:
+   case VARYING_SLOT_BFC0:
+   case VARYING_SLOT_BFC1:
+   case VARYING_SLOT_CLIP_VERTEX:
+   case VARYING_SLOT_CLIP_DIST0:
+   case VARYING_SLOT_CLIP_DIST1:
+   case VARYING_SLOT_POS:
+   case VARYING_SLOT_PSIZ:
+   case VARYING_SLOT_FOGC:
+   case VARYING_SLOT_LAYER:
+   case VARYING_SLOT_EDGE:
+   case VARYING_SLOT_VIEWPORT:
+      supported = true;
+      break;
+   default:
+      supported = (loc >= VARYING_SLOT_VAR0 && loc <= VARYING_SLOT_VAR31) ||
+                  (loc >= VARYING_SLOT_TEX0 && loc <= VARYING_SLOT_TEX7);
+      break;
+   }
+
+   if (!supported)
+      return false;
+
+   r600_shader_io& io = m_proc.sh_info().output[output->data.driver_location];
+   tgsi_get_gl_varying_semantic(static_cast<gl_varying_slot>(loc),
+                                true, &io.name, &io.sid);
+   m_proc.evaluate_spi_sid(io);
+
+   const int component_mask = (1 << glsl_get_components(output->type)) - 1;
+   io.write_mask = component_mask << output->data.location_frac;
+   ++m_proc.sh_info().noutput;
+
+   /* PSIZ, EDGE and LAYER move the clip position counter to 2. */
+   const bool moves_clip_pos = loc == VARYING_SLOT_PSIZ ||
+                               loc == VARYING_SLOT_EDGE ||
+                               loc == VARYING_SLOT_LAYER;
+   if (moves_clip_pos)
+      m_cur_clip_pos = 2;
+
+   const bool without_param = loc == VARYING_SLOT_POS ||
+                              loc == VARYING_SLOT_EDGE ||
+                              loc == VARYING_SLOT_PSIZ ||
+                              loc == VARYING_SLOT_CLIP_VERTEX;
+   if (!without_param)
+      m_param_map[loc] = m_cur_param++;
+
+   return true;
+}
+
+
+/* Dispatch a store to an output variable to the matching export emitter,
+ * based on the variable's location. Returns the emitter's result, or false
+ * (after printing a message to stderr) when the location is not handled. */
+bool VertexStageExportForFS::store_deref(const nir_variable *out_var, nir_intrinsic_instr* instr)
+{
+   const auto slot = out_var->data.location;
+
+   if (slot == VARYING_SLOT_PSIZ) {
+      /* Point size is flagged in the shader info and then exported like POS. */
+      m_proc.sh_info().vs_out_point_size = 1;
+      m_proc.sh_info().vs_out_misc_write = 1;
+      return emit_varying_pos(out_var, instr);
+   }
+
+   if (slot == VARYING_SLOT_POS)
+      return emit_varying_pos(out_var, instr);
+
+   if (slot == VARYING_SLOT_EDGE) {
+      std::array<uint32_t, 4> edge_swizzle = {7 ,0, 7, 7};
+      return emit_varying_pos(out_var, instr, &edge_swizzle);
+   }
+
+   if (slot == VARYING_SLOT_CLIP_VERTEX)
+      return emit_clip_vertices(out_var, instr);
+
+   if (slot == VARYING_SLOT_CLIP_DIST0 || slot == VARYING_SLOT_CLIP_DIST1) {
+      m_num_clip_dist += 4;
+      return emit_varying_param(out_var, instr) && emit_varying_pos(out_var, instr);
+   }
+
+   if (slot == VARYING_SLOT_LAYER) {
+      m_proc.sh_info().vs_out_misc_write = 1;
+      m_proc.sh_info().vs_out_layer = 1;
+      std::array<uint32_t, 4> layer_swizzle = {7,7,0,7};
+      return emit_varying_pos(out_var, instr, &layer_swizzle) &&
+             emit_varying_param(out_var, instr);
+   }
+
+   if (slot == VARYING_SLOT_VIEW_INDEX)
+      return emit_varying_pos(out_var, instr) &&
+             emit_varying_param(out_var, instr);
+
+   /* Everything up to VAR31 and the TEX range is exported as a parameter. */
+   if (slot <= VARYING_SLOT_VAR31 ||
+       (slot >= VARYING_SLOT_TEX0 && slot <= VARYING_SLOT_TEX7))
+      return emit_varying_param(out_var, instr);
+
+   fprintf(stderr, "r600-NIR: Unimplemented store_deref for %d\n",
+           out_var->data.location);
+   return false;
+}
+
+/* Emit a position-type export for the value stored to out_var.
+ *
+ * swizzle_override, when given, replaces the swizzle derived from the
+ * intrinsic's write mask. Returns false for a location this function has no
+ * export slot for; note that by then the output's write mask and output
+ * register have already been updated. */
+bool VertexStageExportForFS::emit_varying_pos(const nir_variable *out_var, nir_intrinsic_instr* instr,
+                                              std::array<uint32_t, 4> *swizzle_override)
+{
+   std::array<uint32_t,4> swizzle;
+   uint32_t write_mask = 0;
+
+   if (swizzle_override) {
+      swizzle = *swizzle_override;
+      /* Only swizzle values below 6 contribute a write-mask bit
+       * (the callers use 7 for the channels they leave out). */
+      for (int i = 0; i < 4; ++i) {
+         if (swizzle[i] < 6)
+            write_mask |= 1 << i;
+      }
+   } else {
+      /* Shift the store's write mask to the variable's first component and
+       * map each enabled channel back to its source component. */
+      write_mask = nir_intrinsic_write_mask(instr) << out_var->data.location_frac;
+      for (int i = 0; i < 4; ++i)
+         swizzle[i] = ((1 << i) & write_mask) ? i - out_var->data.location_frac : 7;
+   }
+
+   m_proc.sh_info().output[out_var->data.driver_location].write_mask = write_mask;
+
+   GPRVector *value = m_proc.vec_from_nir_with_fetch_constant(instr->src[1], write_mask, swizzle);
+   m_proc.set_output(out_var->data.driver_location, PValue(value));
+
+   /* Slot 0 is used for POS; the cases below pick other slots. */
+   int export_slot = 0;
+
+   switch (out_var->data.location) {
+   case VARYING_SLOT_EDGE: {
+      m_proc.sh_info().vs_out_misc_write = 1;
+      m_proc.sh_info().vs_out_edgeflag = 1;
+      /* Clamp channel 1 in place, then convert it from float to int. */
+      m_proc.emit_instruction(op1_mov, value->reg_i(1), {value->reg_i(1)}, {alu_write, alu_dst_clamp, alu_last_instr});
+      m_proc.emit_instruction(op1_flt_to_int, value->reg_i(1), {value->reg_i(1)}, {alu_write, alu_last_instr});
+      m_proc.sh_info().output[out_var->data.driver_location].write_mask = 0xf;
+   }
+      /* fallthrough */
+   case VARYING_SLOT_PSIZ:
+   case VARYING_SLOT_LAYER:
+      export_slot = 1;
+      break;
+   case VARYING_SLOT_POS:
+      break;
+   case VARYING_SLOT_CLIP_DIST0:
+   case VARYING_SLOT_CLIP_DIST1:
+      /* Each clip distance vector takes the next clip position slot. */
+      export_slot = m_cur_clip_pos++;
+      break;
+   default:
+      sfn_log << SfnLog::err << __func__ << "Unsupported location "
+              << out_var->data.location << "\n";
+      return false;
+   }
+
+   /* Remember the most recent position export; finalize_exports() marks it
+    * as the last one. NOTE(review): ownership of the instruction is
+    * presumably taken by emit_export_instruction() - confirm. */
+   m_last_pos_export = new ExportInstruction(export_slot, *value, ExportInstruction::et_pos);
+   m_proc.emit_export_instruction(m_last_pos_export);
+   m_proc.add_param_output_reg(out_var->data.driver_location, m_last_pos_export->gpr_ptr());
+   return true;
+}
+
+/* Emit a parameter-type export for the value stored to out_var.
+ *
+ * The export slot is the parameter index recorded for the variable's location
+ * in m_param_map. Returns false, before any shader state is modified, when no
+ * index was recorded for that location; returns true once the export has
+ * been emitted. */
+bool VertexStageExportForFS::emit_varying_param(const nir_variable *out_var, nir_intrinsic_instr* instr)
+{
+   assert(out_var->data.driver_location < m_proc.sh_info().noutput);
+   sfn_log << SfnLog::io << __func__ << ": emit DDL: " << out_var->data.driver_location << "\n";
+
+   /* Resolve the slot up front and check it at run time: an assert alone
+    * would leave release builds dereferencing end(). */
+   auto param_loc = m_param_map.find(out_var->data.location);
+   if (param_loc == m_param_map.end()) {
+      sfn_log << SfnLog::err << __func__ << ": no parameter slot for location "
+              << out_var->data.location << "\n";
+      return false;
+   }
+
+   /* Shift the store's write mask to the variable's first component and map
+    * each enabled channel back to its source component. */
+   int write_mask = nir_intrinsic_write_mask(instr) << out_var->data.location_frac;
+   std::array<uint32_t,4> swizzle;
+   for (int i = 0; i < 4; ++i)
+      swizzle[i] = ((1 << i) & write_mask) ? i - out_var->data.location_frac : 7;
+
+   m_proc.sh_info().output[out_var->data.driver_location].write_mask = write_mask;
+
+   GPRVector *value = m_proc.vec_from_nir_with_fetch_constant(instr->src[1], write_mask, swizzle);
+   m_proc.sh_info().output[out_var->data.driver_location].gpr = value->sel();
+
+   /* This should use the registers!! */
+   m_proc.set_output(out_var->data.driver_location, PValue(value));
+
+   /* Remember the most recent parameter export; finalize_exports() marks it
+    * as the last one. */
+   m_last_param_export = new ExportInstruction(param_loc->second, *value, ExportInstruction::et_param);
+   m_proc.emit_export_instruction(m_last_param_export);
+   m_proc.add_param_output_reg(out_var->data.driver_location, m_last_param_export->gpr_ptr());
+   return true;
+}
+
+/* Handle a store to the clip vertex: compute eight dot products of the
+ * stored vec4 against constant-buffer values and export the results as two
+ * position exports. Always returns true. */
+bool VertexStageExportForFS::emit_clip_vertices(const nir_variable *out_var, nir_intrinsic_instr* instr)
+{
+   m_proc.sh_info().cc_dist_mask = 0xff;
+   m_proc.sh_info().clip_dist_write = 0xff;
+
+   std::unique_ptr<GPRVector> clip_vertex(m_proc.vec_from_nir_with_fetch_constant(instr->src[1], 0xf, {0,1,2,3}));
+
+   /* Mark all four components of the output as written. */
+   for (int i = 0; i < 4; ++i)
+      m_proc.sh_info().output[out_var->data.driver_location].write_mask |= 1 << i;
+
+   GPRVector clip_dist[2] = { m_proc.get_temp_vec4(), m_proc.get_temp_vec4()};
+
+   /* Result i lands in channel (i & 3) of clip_dist[i >> 2]. Each result is
+    * a group of four dot4 ALU ops of which only the one in the target
+    * channel has the write flag set.
+    * NOTE(review): UniformValue(512 + i, ...) presumably addresses the user
+    * clip planes in the driver's buffer-info constant buffer - confirm. */
+   for (int i = 0; i < 8; i++) {
+      int oreg = i >> 2;
+      int ochan = i & 3;
+      AluInstruction *ir = nullptr;
+      for (int j = 0; j < 4; j++) {
+         ir = new AluInstruction(op2_dot4_ieee, clip_dist[oreg].reg_i(j), clip_vertex->reg_i(j),
+                                 PValue(new UniformValue(512 + i, j, R600_BUFFER_INFO_CONST_BUFFER)),
+                                 (j == ochan) ? EmitInstruction::write : EmitInstruction::empty);
+         m_proc.emit_instruction(ir);
+      }
+      /* Close the group on its fourth op. */
+      ir->set_flag(alu_last_instr);
+   }
+
+   /* NOTE(review): m_cur_clip_pos is advanced only once although two slots
+    * are used, so afterwards it still names the second slot - confirm that
+    * this is intended. */
+   m_last_pos_export = new ExportInstruction(m_cur_clip_pos++, clip_dist[0], ExportInstruction::et_pos);
+   m_proc.emit_export_instruction(m_last_pos_export);
+
+   m_last_pos_export = new ExportInstruction(m_cur_clip_pos, clip_dist[1], ExportInstruction::et_pos);
+   m_proc.emit_export_instruction(m_last_pos_export);
+
+   return true;
+}
+
+/* Finish the export sequence: optionally export the primitive id, emit the
+ * stream-output writes, and make sure there is a parameter export and a
+ * position export, each with its last one marked via set_last(). */
+void VertexStageExportForFS::finalize_exports()
+{
+   if (m_key.vs.as_gs_a) {
+      /* Export the primitive id in the first channel of one more parameter
+       * and describe it in a newly appended output slot. */
+      PValue o(new GPRValue(0,PIPE_SWIZZLE_0));
+      GPRVector primid({m_proc.primitive_id(), o,o,o});
+      m_last_param_export = new ExportInstruction(m_cur_param, primid, ExportInstruction::et_param);
+      m_proc.emit_export_instruction(m_last_param_export);
+      int i;
+      i = m_proc.sh_info().noutput++;
+      auto& io = m_proc.sh_info().output[i];
+      io.name = TGSI_SEMANTIC_PRIMID;
+      io.sid = 0;
+      io.gpr = 0;
+      io.interpolate = TGSI_INTERPOLATE_CONSTANT;
+      io.write_mask = 0x1;
+      io.spi_sid = m_key.vs.prim_id_out;
+      m_proc.sh_info().vs_as_gs_a = 1;
+   }
+
+   /* NOTE(review): the result of emit_stream() is ignored, so a failure
+    * there is not reported by this function. */
+   if (m_so_info && m_so_info->num_outputs)
+      emit_stream(-1);
+
+   m_pipe_shader->enabled_stream_buffers_mask = m_enabled_stream_buffers_mask;
+
+   /* No parameter export was emitted: add one with all channels set to 7. */
+   if (!m_last_param_export) {
+      GPRVector value(0,{7,7,7,7});
+      m_last_param_export = new ExportInstruction(0, value, ExportInstruction::et_param);
+      m_proc.emit_export_instruction(m_last_param_export);
+   }
+   m_last_param_export->set_last();
+
+   /* Same for the position exports. */
+   if (!m_last_pos_export) {
+      GPRVector value(0,{7,7,7,7});
+      m_last_pos_export = new ExportInstruction(0, value, ExportInstruction::et_pos);
+      m_proc.emit_export_instruction(m_last_pos_export);
+   }
+   m_last_pos_export->set_last();
+}
+
+/* Emit the stream-output writes described by m_so_info.
+ *
+ * stream selects which entries are written: -1 means every entry, any other
+ * value only the entries whose stream field matches.
+ *
+ * Returns false if there are more than PIPE_MAX_SO_OUTPUTS entries, if an
+ * entry names an output buffer >= 4, or if an entry's register index has no
+ * output register; true otherwise. Each written buffer/stream pair is also
+ * recorded in m_enabled_stream_buffers_mask. */
+bool VertexStageExportForFS::emit_stream(int stream)
+{
+   assert(m_so_info);
+   if (m_so_info->num_outputs > PIPE_MAX_SO_OUTPUTS) {
+      R600_ERR("Too many stream outputs: %d\n", m_so_info->num_outputs);
+      return false;
+   }
+   for (unsigned i = 0; i < m_so_info->num_outputs; i++) {
+      if (m_so_info->output[i].output_buffer >= 4) {
+         R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
+                  m_so_info->output[i].output_buffer);
+         return false;
+      }
+   }
+   const GPRVector *so_gpr[PIPE_MAX_SHADER_OUTPUTS];
+   unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS];
+   /* Sized up front so that the element addresses taken below stay valid. */
+   std::vector<GPRVector> tmp(m_so_info->num_outputs);
+
+   /* Initialize locations where the outputs are stored. */
+   for (unsigned i = 0; i < m_so_info->num_outputs; i++) {
+      if (stream != -1 && stream != m_so_info->output[i].stream)
+         continue;
+
+      sfn_log << SfnLog::instr << "Emit stream " << i
+              << " with register index " << m_so_info->output[i].register_index << "  so_gpr:";
+
+
+      so_gpr[i] = m_proc.output_register(m_so_info->output[i].register_index);
+
+      if (!so_gpr[i]) {
+         sfn_log << SfnLog::err << "\nERR: register index "
+                 << m_so_info->output[i].register_index
+                 << " doesn't correspond to an output register\n";
+         return false;
+      }
+      start_comp[i] = m_so_info->output[i].start_component;
+      /* Lower outputs with dst_offset < start_component.
+       *
+       * We can only output 4D vectors with a write mask, e.g. we can
+       * only output the W component at offset 3, etc. If we want
+       * to store Y, Z, or W at buffer offset 0, we need to use MOV
+       * to move it to X and output X. */
+      if (m_so_info->output[i].dst_offset < m_so_info->output[i].start_component) {
+         int tmp_index = m_proc.allocate_temp_register();
+         int sc = m_so_info->output[i].start_component;
+         AluInstruction *alu = nullptr;
+         for (int j = 0; j < m_so_info->output[i].num_components; j++) {
+            PValue dst(new GPRValue(tmp_index, j));
+            alu = new AluInstruction(op1_mov, dst, so_gpr[i]->reg_i(j + sc), {alu_write});
+            tmp[i].set_reg_i(j, dst);
+            m_proc.emit_instruction(alu);
+         }
+         if (alu)
+            alu->set_flag(alu_last_instr);
+
+         /* Fill the vector with masked values */
+         PValue dst_blank(new GPRValue(tmp_index, 7));
+         for (int j = m_so_info->output[i].num_components; j < 4; j++)
+            tmp[i].set_reg_i(j, dst_blank);
+
+         start_comp[i] = 0;
+         so_gpr[i] = &tmp[i];
+      }
+      sfn_log << SfnLog::instr <<  *so_gpr[i] << "\n";
+   }
+
+   /* Write outputs to buffers. */
+   for (unsigned i = 0; i < m_so_info->num_outputs; i++) {
+      /* Entries of other streams were skipped above, so their so_gpr and
+       * start_comp slots were never set up; they must be skipped here too. */
+      if (stream != -1 && stream != m_so_info->output[i].stream)
+         continue;
+
+      sfn_log << SfnLog::instr << "Write output buffer " << i
+              << " with register index " << m_so_info->output[i].register_index << "\n";
+
+      StreamOutIntruction *out_stream =
+            new StreamOutIntruction(*so_gpr[i],
+                                    m_so_info->output[i].num_components,
+                                    m_so_info->output[i].dst_offset - start_comp[i],
+                                    ((1 << m_so_info->output[i].num_components) - 1) << start_comp[i],
+                                    m_so_info->output[i].output_buffer,
+                                    m_so_info->output[i].stream);
+      m_proc.emit_export_instruction(out_stream);
+      m_enabled_stream_buffers_mask |= (1 << m_so_info->output[i].output_buffer) << m_so_info->output[i].stream * 4;
+   }
+   return true;
+}
+
+
+/* Set up the GS exporter; gs_shader is the shader whose inputs the stored
+ * outputs are matched against in store_deref().
+ *
+ * m_num_clip_dist has to start at zero because store_deref() only ever adds
+ * to it; leaving it uninitialized would make that addition read an
+ * indeterminate value. */
+VertexStageExportForGS::VertexStageExportForGS(VertexStage &proc,
+                                               const r600_shader *gs_shader):
+   VertexStageExportBase(proc),
+   m_num_clip_dist(0),
+   m_gs_shader(gs_shader)
+{
+}
+
+/* Write the value stored to out_var into the memory ring, at the ring offset
+ * of the GS input that has the same semantic name and sid.
+ *
+ * Stores to VARYING_SLOT_VIEWPORT, and stores for which no matching GS input
+ * exists (an error is logged in that case), emit nothing. Always returns
+ * true. */
+bool VertexStageExportForGS::store_deref(const nir_variable *out_var, nir_intrinsic_instr* instr)
+{
+   const auto out_index = out_var->data.driver_location;
+   const r600_shader_io& vs_out = m_proc.sh_info().output[out_index];
+
+   sfn_log << SfnLog::io << "check output " << out_index
+           << " name=" << vs_out.name<< " sid=" << vs_out.sid << "\n";
+
+   /* Look for the first GS input with the same name/sid pair. */
+   int ring_offset = -1;
+   for (unsigned k = 0; k < m_gs_shader->ninput; ++k) {
+      const auto& gs_in = m_gs_shader->input[k];
+      sfn_log << SfnLog::io << "  against  " <<  k << " name=" << gs_in.name<< " sid=" << gs_in.sid << "\n";
+
+      const bool same_semantic = gs_in.name == vs_out.name &&
+                                 gs_in.sid == vs_out.sid;
+      if (!same_semantic)
+         continue;
+
+      ring_offset = gs_in.ring_offset;
+      break;
+   }
+
+   if (out_var->data.location == VARYING_SLOT_VIEWPORT)
+      return true;
+
+   if (ring_offset == -1) {
+      sfn_log << SfnLog::err << "VS defines output at "
+              << out_index << "name=" << vs_out.name
+              << " sid=" << vs_out.sid << " that is not consumed as GS input\n";
+      return true;
+   }
+
+   const uint32_t write_mask = (1 << instr->num_components) - 1;
+
+   std::unique_ptr<GPRVector> value(
+         m_proc.vec_from_nir_with_fetch_constant(instr->src[1], write_mask,
+                                                 swizzle_from_mask(instr->num_components)));
+
+   m_proc.emit_export_instruction(
+         new MemRingOutIntruction(cf_mem_ring, mem_write, *value,
+                                  ring_offset >> 2, 4, PValue()));
+
+   m_proc.sh_info().output[out_index].write_mask |= write_mask;
+
+   switch (out_var->data.location) {
+   case VARYING_SLOT_CLIP_DIST0:
+   case VARYING_SLOT_CLIP_DIST1:
+      m_num_clip_dist += 4;
+      break;
+   default:
+      break;
+   }
+
+   return true;
+}
+
+/* The GS exporter has nothing to do at the end of the shader. */
+void VertexStageExportForGS::finalize_exports()
+{
+}
+
+/* The ES exporter keeps no state beyond what the base class holds. */
+VertexStageExportForES::VertexStageExportForES(VertexStage& proc)
+   : VertexStageExportBase(proc)
+{
+}
+
+/* The ES exporter emits nothing for an output store and reports success. */
+bool VertexStageExportForES::store_deref(const nir_variable *out_var, nir_intrinsic_instr* instr)
+{
+   (void)out_var;
+   (void)instr;
+   return true;
+}
+
+/* The ES exporter has nothing to do at the end of the shader. */
+void VertexStageExportForES::finalize_exports()
+{
+}
+
+}
diff --git a/src/gallium/drivers/r600/sfn/sfn_vertexstageexport.h b/src/gallium/drivers/r600/sfn/sfn_vertexstageexport.h
new file mode 100644
index 00000000000..7e2ca620be6
--- /dev/null
+++ b/src/gallium/drivers/r600/sfn/sfn_vertexstageexport.h
@@ -0,0 +1,85 @@
+#ifndef VERTEXSTAGEEXPORT_H
+#define VERTEXSTAGEEXPORT_H
+
+#include "sfn_shader_base.h"
+
+namespace r600 {
+
+/* Shader processor interface used by the vertex-stage exporters: a
+ * ShaderFromNirProcessor that can additionally provide the primitive id. */
+class VertexStage : public ShaderFromNirProcessor {
+public:
+   using ShaderFromNirProcessor::ShaderFromNirProcessor;
+
+   /* Value holding the primitive id; supplied by the concrete stage. */
+   virtual PValue primitive_id() = 0;
+};
+
+/* Common interface and bookkeeping for the objects that turn output stores
+ * of a vertex stage into exports for the following stage. */
+class VertexStageExportBase
+{
+public:
+   VertexStageExportBase(VertexStage& proc);
+   virtual ~VertexStageExportBase();
+   /* Renumber the entries of m_param_map consecutively by ascending key. */
+   void setup_paramn_map();
+   /* Handle a store to out_var; returns false on failure. */
+   virtual bool store_deref(const nir_variable *out_var, nir_intrinsic_instr* instr) = 0;
+   /* Called to complete the export sequence. */
+   virtual void finalize_exports() = 0;
+   /* Register an output variable; returns false if it is not supported. */
+   virtual bool do_process_outputs(nir_variable *output);
+   /* Number of parameter indices handed out so far. */
+   int cur_param() const {return m_cur_param;}
+protected:
+   /* Processor the exports are emitted into; held by reference, so it must
+    * outlive this object. */
+   VertexStage& m_proc;
+   /* Output location -> parameter index. */
+   std::map<unsigned, unsigned> m_param_map;
+   /* Next position export slot used for clip data. */
+   int m_cur_clip_pos;
+   /* Next parameter index to hand out. */
+   int m_cur_param;
+};
+
+/* Exporter that emits position and parameter exports, plus the
+ * stream-output writes described by so_info. */
+class VertexStageExportForFS : public VertexStageExportBase
+{
+public:
+   VertexStageExportForFS(VertexStage& proc,
+                          const pipe_stream_output_info *so_info,
+                          r600_pipe_shader *pipe_shader,
+                          const r600_shader_key& key);
+
+   bool store_deref(const nir_variable *out_var, nir_intrinsic_instr* instr) override;
+   void finalize_exports() override;
+
+private:
+   /* Emit a parameter export for the stored value. */
+   bool emit_varying_param(const nir_variable *out_var, nir_intrinsic_instr* instr);
+   /* Emit a position export; swizzle_override replaces the swizzle derived
+    * from the store's write mask when it is not null. */
+   bool emit_varying_pos(const nir_variable *out_var, nir_intrinsic_instr* instr,
+                         std::array<uint32_t, 4> *swizzle_override = nullptr);
+   /* Compute the clip distances from a clip vertex and export them. */
+   bool emit_clip_vertices(const nir_variable *out_var, nir_intrinsic_instr* instr);
+   /* Emit stream-output writes; -1 selects all streams. */
+   bool emit_stream(int stream);
+
+   /* Most recently emitted export of each kind, or nullptr if none yet. */
+   ExportInstruction *m_last_param_export;
+   ExportInstruction *m_last_pos_export;
+
+   int m_num_clip_dist;
+   /* Accumulated by emit_stream() and copied to the pipe shader in
+    * finalize_exports(). */
+   int m_enabled_stream_buffers_mask;
+   const pipe_stream_output_info *m_so_info;
+   r600_pipe_shader *m_pipe_shader;
+   /* Held by reference: the key passed to the constructor must outlive this
+    * object. */
+   const r600_shader_key& m_key;
+
+};
+
+/* Exporter that writes the outputs to the memory ring, at the offsets of the
+ * matching inputs of the given geometry shader. */
+class VertexStageExportForGS : public VertexStageExportBase
+{
+public:
+   VertexStageExportForGS(VertexStage& proc,
+                          const r600_shader *gs_shader);
+   bool store_deref(const nir_variable *out_var, nir_intrinsic_instr* instr) override;
+   void finalize_exports() override;
+
+private:
+   /* Incremented by 4 for every store to a clip distance slot. */
+   unsigned m_num_clip_dist;
+   /* Geometry shader whose inputs the outputs are matched against. */
+   const r600_shader *m_gs_shader;
+};
+
+/* Exporter whose store_deref() and finalize_exports() currently emit
+ * nothing. */
+class VertexStageExportForES : public VertexStageExportBase
+{
+public:
+   VertexStageExportForES(VertexStage& proc);
+   bool store_deref(const nir_variable *out_var, nir_intrinsic_instr* instr) override;
+   void finalize_exports() override;
+};
+
+
+}
+
+#endif // VERTEXSTAGEEXPORT_H