[PATCH RFC 18/18] drm/asahi: Add the Asahi driver for Apple AGX GPUs

Tue Mar 7 14:25:43 UTC 2023

The `asahi` drm driver supports Apple AGX GPUs of the following
generations:

- G13G (Apple M1)
- G13S (Apple M1 Pro)
- G13C (Apple M1 Max)
- G13D (Apple M1 Ultra)
- G14G (Apple M2)

Signed-off-by: Asahi Lina <lina at asahilina.net>
---
 drivers/gpu/drm/Kconfig                |    2 +
 drivers/gpu/drm/Makefile               |    1 +
 drivers/gpu/drm/asahi/Kconfig          |   35 +
 drivers/gpu/drm/asahi/Makefile         |    3 +
 drivers/gpu/drm/asahi/alloc.rs         | 1046 ++++++++++++++++++++++++++
 drivers/gpu/drm/asahi/asahi.rs         |   53 ++
 drivers/gpu/drm/asahi/buffer.rs        |  694 ++++++++++++++++++
 drivers/gpu/drm/asahi/channel.rs       |  542 ++++++++++++++
 drivers/gpu/drm/asahi/debug.rs         |  129 ++++
 drivers/gpu/drm/asahi/driver.rs        |  166 +++++
 drivers/gpu/drm/asahi/event.rs         |  229 ++++++
 drivers/gpu/drm/asahi/file.rs          |  718 ++++++++++++++++++
 drivers/gpu/drm/asahi/float.rs         |  381 ++++++++++
 drivers/gpu/drm/asahi/fw/buffer.rs     |  170 +++++
 drivers/gpu/drm/asahi/fw/channels.rs   |  385 ++++++++++
 drivers/gpu/drm/asahi/fw/compute.rs    |  107 +++
 drivers/gpu/drm/asahi/fw/event.rs      |  100 +++
 drivers/gpu/drm/asahi/fw/fragment.rs   |  276 +++++++
 drivers/gpu/drm/asahi/fw/initdata.rs   | 1264 ++++++++++++++++++++++++++++++++
 drivers/gpu/drm/asahi/fw/job.rs        |   56 ++
 drivers/gpu/drm/asahi/fw/microseq.rs   |  384 ++++++++++
 drivers/gpu/drm/asahi/fw/mod.rs        |   15 +
 drivers/gpu/drm/asahi/fw/types.rs      |  233 ++++++
 drivers/gpu/drm/asahi/fw/vertex.rs     |  177 +++++
 drivers/gpu/drm/asahi/fw/workqueue.rs  |  168 +++++
 drivers/gpu/drm/asahi/gem.rs           |  301 ++++++++
 drivers/gpu/drm/asahi/gpu.rs           | 1088 +++++++++++++++++++++++++++
 drivers/gpu/drm/asahi/hw/mod.rs        |  522 +++++++++++++
 drivers/gpu/drm/asahi/hw/t600x.rs      |  140 ++++
 drivers/gpu/drm/asahi/hw/t8103.rs      |   80 ++
 drivers/gpu/drm/asahi/hw/t8112.rs      |   82 +++
 drivers/gpu/drm/asahi/initdata.rs      |  777 ++++++++++++++++++++
 drivers/gpu/drm/asahi/mem.rs           |  133 ++++
 drivers/gpu/drm/asahi/microseq.rs      |   61 ++
 drivers/gpu/drm/asahi/mmu.rs           | 1249 +++++++++++++++++++++++++++++++
 drivers/gpu/drm/asahi/object.rs        |  704 ++++++++++++++++++
 drivers/gpu/drm/asahi/place.rs         |  343 +++++++++
 drivers/gpu/drm/asahi/queue/common.rs  |   52 ++
 drivers/gpu/drm/asahi/queue/compute.rs |  371 ++++++++++
 drivers/gpu/drm/asahi/queue/mod.rs     |  725 ++++++++++++++++++
 drivers/gpu/drm/asahi/queue/render.rs  | 1173 +++++++++++++++++++++++++++++
 drivers/gpu/drm/asahi/regs.rs          |  387 ++++++++++
 drivers/gpu/drm/asahi/slotalloc.rs     |  292 ++++++++
 drivers/gpu/drm/asahi/util.rs          |   44 ++
 drivers/gpu/drm/asahi/workqueue.rs     |  880 ++++++++++++++++++++++
 45 files changed, 16738 insertions(+)

diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig
index 8b5ad6aee126..acdd42fc543a 100644
--- a/drivers/gpu/drm/Kconfig
+++ b/drivers/gpu/drm/Kconfig
@@ -384,6 +384,8 @@ source "drivers/gpu/drm/solomon/Kconfig"
 
 source "drivers/gpu/drm/sprd/Kconfig"
 
+source "drivers/gpu/drm/asahi/Kconfig"
+
 config DRM_HYPERV
 	tristate "DRM Support for Hyper-V synthetic video device"
 	depends on DRM && PCI && MMU && HYPERV
diff --git a/drivers/gpu/drm/Makefile b/drivers/gpu/drm/Makefile
index ab4460fcd63f..f144501d9c60 100644
--- a/drivers/gpu/drm/Makefile
+++ b/drivers/gpu/drm/Makefile
@@ -190,3 +190,4 @@ obj-y			+= gud/
 obj-$(CONFIG_DRM_HYPERV) += hyperv/
 obj-y			+= solomon/
 obj-$(CONFIG_DRM_SPRD) += sprd/
+obj-$(CONFIG_DRM_ASAHI) += asahi/
diff --git a/drivers/gpu/drm/asahi/Kconfig b/drivers/gpu/drm/asahi/Kconfig
new file mode 100644
index 000000000000..9d7caa49b64c
--- /dev/null
+++ b/drivers/gpu/drm/asahi/Kconfig
@@ -0,0 +1,35 @@
+# SPDX-License-Identifier: GPL-2.0
+
+config DRM_ASAHI
+	tristate "Asahi (DRM support for Apple AGX GPUs)"
+	depends on RUST
+	depends on RUST_DRM
+	depends on RUST_APPLE_RTKIT
+	depends on (ARM64 && ARCH_APPLE) || (COMPILE_TEST && !GENERIC_ATOMIC64)
+	depends on MMU
+	select IOMMU_SUPPORT
+	select IOMMU_IO_PGTABLE_LPAE
+	select RUST_DRM_SCHED
+	select RUST_DRM_GEM_SHMEM_HELPER
+	help
+	  DRM driver for Apple AGX GPUs (G13x/G14).
+
+	  This driver supports the following SoCs:
+
+	  - T8103 "M1"
+	  - T8112 "M2"
+	  - T6000 "M1 Pro"
+	  - T6001 "M1 Max"
+	  - T6002 "M1 Ultra"
+
+config DRM_ASAHI_DEBUG_ALLOCATOR
+	bool "Use debug allocator"
+	depends on DRM_ASAHI
+	help
+	  Use an alternate, simpler allocator which significantly reduces
+	  performance, but can help find firmware- or GPU-side memory safety
+	  issues. However, it can also trigger firmware bugs more easily,
+	  so expect GPU crashes.
+
+	  Say N unless you are debugging firmware structures or porting to a
+	  new firmware version.
diff --git a/drivers/gpu/drm/asahi/Makefile b/drivers/gpu/drm/asahi/Makefile
new file mode 100644
index 000000000000..e67248667987
--- /dev/null
+++ b/drivers/gpu/drm/asahi/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_DRM_ASAHI) += asahi.o
diff --git a/drivers/gpu/drm/asahi/alloc.rs b/drivers/gpu/drm/asahi/alloc.rs
new file mode 100644
index 000000000000..d918b19e9721
--- /dev/null
+++ b/drivers/gpu/drm/asahi/alloc.rs
@@ -0,0 +1,1046 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+
+//! GPU kernel object allocator.
+//!
+//! This kernel driver needs to manage a large number of GPU objects, in both firmware/kernel
+//! address space and user address space. This module implements a simple grow-only heap allocator
+//! based on the DRM MM range allocator, and a debug allocator that allocates each object as a
+//! separate GEM object.
+//!
+//! Allocations may optionally have debugging enabled, which adds preambles that store metadata
+//! about the allocation. This is useful for live debugging using the hypervisor or postmortem
+//! debugging with a GPU memory snapshot, since it makes it easier to identify use-after-free and
+//! caching issues.
+
+use kernel::{c_str, drm::mm, error::Result, prelude::*, str::CString, sync::LockClassKey};
+
+use crate::debug::*;
+use crate::driver::AsahiDevice;
+use crate::fw::types::Zeroed;
+use crate::mmu;
+use crate::object::{GpuArray, GpuObject, GpuOnlyArray, GpuStruct, GpuWeakPointer};
+
+use core::cmp::Ordering;
+use core::fmt;
+use core::fmt::{Debug, Formatter};
+use core::marker::PhantomData;
+use core::mem;
+use core::mem::MaybeUninit;
+use core::ptr::NonNull;
+
+const DEBUG_CLASS: DebugFlags = DebugFlags::Alloc;
+
+#[cfg(not(CONFIG_DRM_ASAHI_DEBUG_ALLOCATOR))]
+/// The driver-global allocator type
+pub(crate) type DefaultAllocator = HeapAllocator;
+
+#[cfg(not(CONFIG_DRM_ASAHI_DEBUG_ALLOCATOR))]
+/// The driver-global allocation type
+pub(crate) type DefaultAllocation = HeapAllocation;
+
+#[cfg(CONFIG_DRM_ASAHI_DEBUG_ALLOCATOR)]
+/// The driver-global allocator type
+pub(crate) type DefaultAllocator = SimpleAllocator;
+
+#[cfg(CONFIG_DRM_ASAHI_DEBUG_ALLOCATOR)]
+/// The driver-global allocation type
+pub(crate) type DefaultAllocation = SimpleAllocation;
+
+/// Represents a raw allocation (without any type information).
+pub(crate) trait RawAllocation {
+    /// Returns the CPU-side pointer (if CPU mapping is enabled) as a byte non-null pointer.
+    fn ptr(&self) -> Option<NonNull<u8>>;
+    /// Returns the GPU VA pointer as a u64.
+    fn gpu_ptr(&self) -> u64;
+    /// Returns the size of the allocation in bytes.
+    fn size(&self) -> usize;
+    /// Returns the AsahiDevice that owns this allocation.
+    fn device(&self) -> &AsahiDevice;
+}
+
+/// Represents a typed allocation.
+pub(crate) trait Allocation<T>: Debug {
+    /// Returns the typed CPU-side pointer (if CPU mapping is enabled).
+    fn ptr(&self) -> Option<NonNull<T>>;
+    /// Returns the GPU VA pointer as a u64.
+    fn gpu_ptr(&self) -> u64;
+    /// Returns the size of the allocation in bytes.
+    fn size(&self) -> usize;
+    /// Returns the AsahiDevice that owns this allocation.
+    fn device(&self) -> &AsahiDevice;
+}
+
+/// A generic typed allocation wrapping a RawAllocation.
+///
+/// This is currently the only Allocation implementation, since it is shared by all allocators.
+pub(crate) struct GenericAlloc<T, U: RawAllocation> {
+    alloc: U,
+    alloc_size: usize,
+    debug_offset: usize,
+    padding: usize,
+    _p: PhantomData<T>,
+}
+
+impl<T, U: RawAllocation> Allocation<T> for GenericAlloc<T, U> {
+    fn ptr(&self) -> Option<NonNull<T>> {
+        self.alloc
+            .ptr()
+            .map(|p| unsafe { NonNull::new_unchecked(p.as_ptr().add(self.debug_offset) as *mut T) })
+    }
+    fn gpu_ptr(&self) -> u64 {
+        self.alloc.gpu_ptr() + self.debug_offset as u64
+    }
+    fn size(&self) -> usize {
+        self.alloc_size
+    }
+    fn device(&self) -> &AsahiDevice {
+        self.alloc.device()
+    }
+}
+
+impl<T, U: RawAllocation> Debug for GenericAlloc<T, U> {
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        f.debug_struct(core::any::type_name::<GenericAlloc<T, U>>())
+            .field("ptr", &format_args!("{:?}", self.ptr()))
+            .field("gpu_ptr", &format_args!("{:#X?}", self.gpu_ptr()))
+            .field("size", &format_args!("{:#X?}", self.size()))
+            .finish()
+    }
+}
+
+/// Debugging data associated with an allocation, when debugging is enabled.
+#[repr(C)]
+struct AllocDebugData {
+    state: u32,
+    _pad: u32,
+    size: u64,
+    base_gpuva: u64,
+    obj_gpuva: u64,
+    name: [u8; 0x20],
+}
+
+/// Magic flag indicating a live allocation.
+const STATE_LIVE: u32 = 0x4556494c;
+/// Magic flag indicating a freed allocation.
+const STATE_DEAD: u32 = 0x44414544;
+
+/// Marker byte to identify when firmware/GPU write beyond the end of an allocation.
+const GUARD_MARKER: u8 = 0x93;
+
+impl<T, U: RawAllocation> Drop for GenericAlloc<T, U> {
+    fn drop(&mut self) {
+        let debug_len = mem::size_of::<AllocDebugData>();
+        if self.debug_offset >= debug_len {
+            if let Some(p) = self.alloc.ptr() {
+                unsafe {
+                    let p = p.as_ptr().add(self.debug_offset - debug_len);
+                    (p as *mut u32).write(STATE_DEAD);
+                }
+            }
+        }
+        if debug_enabled(DebugFlags::FillAllocations) {
+            if let Some(p) = self.ptr() {
+                unsafe { (p.as_ptr() as *mut u8).write_bytes(0xde, self.size()) };
+            }
+        }
+        if self.padding != 0 {
+            if let Some(p) = self.ptr() {
+                let guard = unsafe {
+                    core::slice::from_raw_parts(
+                        (p.as_ptr() as *mut u8 as *const u8).add(self.size()),
+                        self.padding,
+                    )
+                };
+                if let Some(first_err) = guard.iter().position(|&r| r != GUARD_MARKER) {
+                    let last_err = guard
+                        .iter()
+                        .rev()
+                        .position(|&r| r != GUARD_MARKER)
+                        .unwrap_or(0);
+                    dev_warn!(
+                        self.device(),
+                        "Allocator: Corruption after object of type {} at {:#x}:{:#x} + {:#x}..={:#x}\n",
+                        core::any::type_name::<T>(),
+                        self.gpu_ptr(),
+                        self.size(),
+                        first_err,
+                        self.padding - last_err - 1
+                    );
+                }
+            }
+        }
+    }
+}
+
+static_assert!(mem::size_of::<AllocDebugData>() == 0x40);
+
+/// A trait representing an allocator.
+pub(crate) trait Allocator {
+    /// The raw allocation type used by this allocator.
+    type Raw: RawAllocation;
+    // TODO: Needs associated_type_defaults
+    // type Allocation<T> = GenericAlloc<T, Self::Raw>;
+
+    /// Returns the `AsahiDevice` associated with this allocator.
+    fn device(&self) -> &AsahiDevice;
+    /// Returns whether CPU-side mapping is enabled.
+    fn cpu_maps(&self) -> bool;
+    /// Returns the minimum alignment for allocations.
+    fn min_align(&self) -> usize;
+    /// Allocate an object of the given size in bytes with the given alignment.
+    fn alloc(&mut self, size: usize, align: usize) -> Result<Self::Raw>;
+
+    /// Returns a tuple of (count, size) of how much garbage (freed but not yet reusable objects)
+    /// exists in this allocator. Optional.
+    fn garbage(&self) -> (usize, usize) {
+        (0, 0)
+    }
+    /// Collect garbage for this allocator, up to the given object count. Optional.
+    fn collect_garbage(&mut self, _count: usize) {}
+
+    /// Allocate a new GpuStruct object. See [`GpuObject::new`].
+    #[inline(never)]
+    fn new_object<T: GpuStruct>(
+        &mut self,
+        inner: T,
+        callback: impl for<'a> FnOnce(&'a T) -> T::Raw<'a>,
+    ) -> Result<GpuObject<T, GenericAlloc<T, Self::Raw>>> {
+        GpuObject::<T, GenericAlloc<T, Self::Raw>>::new(self.alloc_object()?, inner, callback)
+    }
+
+    /// Allocate a new GpuStruct object. See [`GpuObject::new_boxed`].
+    #[inline(never)]
+    fn new_boxed<T: GpuStruct>(
+        &mut self,
+        inner: Box<T>,
+        callback: impl for<'a> FnOnce(
+            &'a T,
+            &'a mut MaybeUninit<T::Raw<'a>>,
+        ) -> Result<&'a mut T::Raw<'a>>,
+    ) -> Result<GpuObject<T, GenericAlloc<T, Self::Raw>>> {
+        GpuObject::<T, GenericAlloc<T, Self::Raw>>::new_boxed(self.alloc_object()?, inner, callback)
+    }
+
+    /// Allocate a new GpuStruct object. See [`GpuObject::new_inplace`].
+    #[inline(never)]
+    fn new_inplace<T: GpuStruct>(
+        &mut self,
+        inner: T,
+        callback: impl for<'a> FnOnce(
+            &'a T,
+            &'a mut MaybeUninit<T::Raw<'a>>,
+        ) -> Result<&'a mut T::Raw<'a>>,
+    ) -> Result<GpuObject<T, GenericAlloc<T, Self::Raw>>> {
+        GpuObject::<T, GenericAlloc<T, Self::Raw>>::new_inplace(
+            self.alloc_object()?,
+            inner,
+            callback,
+        )
+    }
+
+    /// Allocate a new GpuStruct object. See [`GpuObject::new_default`].
+    #[inline(never)]
+    fn new_default<T: GpuStruct + Default>(
+        &mut self,
+    ) -> Result<GpuObject<T, GenericAlloc<T, Self::Raw>>>
+    where
+        for<'a> <T as GpuStruct>::Raw<'a>: Default + Zeroed,
+    {
+        GpuObject::<T, GenericAlloc<T, Self::Raw>>::new_default(self.alloc_object()?)
+    }
+
+    /// Allocate a new GpuStruct object. See [`GpuObject::new_prealloc`].
+    #[inline(never)]
+    fn new_prealloc<T: GpuStruct>(
+        &mut self,
+        inner_cb: impl FnOnce(GpuWeakPointer<T>) -> Result<Box<T>>,
+        raw_cb: impl for<'a> FnOnce(
+            &'a T,
+            &'a mut MaybeUninit<T::Raw<'a>>,
+        ) -> Result<&'a mut T::Raw<'a>>,
+    ) -> Result<GpuObject<T, GenericAlloc<T, Self::Raw>>> {
+        GpuObject::<T, GenericAlloc<T, Self::Raw>>::new_prealloc(
+            self.alloc_object()?,
+            inner_cb,
+            raw_cb,
+        )
+    }
+
+    /// Allocate a generic buffer of the given size and alignment, applying the debug features if
+    /// enabled to tag it and detect overflows.
+    fn alloc_generic<T>(
+        &mut self,
+        size: usize,
+        align: usize,
+    ) -> Result<GenericAlloc<T, Self::Raw>> {
+        let padding = if debug_enabled(DebugFlags::DetectOverflows) {
+            size
+        } else {
+            0
+        };
+
+        let ret: GenericAlloc<T, Self::Raw> =
+            if self.cpu_maps() && debug_enabled(debug::DebugFlags::DebugAllocations) {
+                let debug_align = self.min_align().max(align);
+                let debug_len = mem::size_of::<AllocDebugData>();
+                let debug_offset = (debug_len * 2 + debug_align - 1) & !(debug_align - 1);
+
+                let alloc = self.alloc(size + debug_offset + padding, align)?;
+
+                let mut debug = AllocDebugData {
+                    state: STATE_LIVE,
+                    _pad: 0,
+                    size: size as u64,
+                    base_gpuva: alloc.gpu_ptr(),
+                    obj_gpuva: alloc.gpu_ptr() + debug_offset as u64,
+                    name: [0; 0x20],
+                };
+
+                let name = core::any::type_name::<T>().as_bytes();
+                let len = name.len().min(debug.name.len() - 1);
+                debug.name[..len].copy_from_slice(&name[..len]);
+
+                if let Some(p) = alloc.ptr() {
+                    unsafe {
+                        let p = p.as_ptr();
+                        p.write_bytes(0x42, debug_offset - 2 * debug_len);
+                        let cur = p.add(debug_offset - debug_len) as *mut AllocDebugData;
+                        let prev = p.add(debug_offset - 2 * debug_len) as *mut AllocDebugData;
+                        prev.copy_from(cur, 1);
+                        cur.copy_from(&debug, 1);
+                    };
+                }
+
+                GenericAlloc {
+                    alloc,
+                    alloc_size: size,
+                    debug_offset,
+                    padding,
+                    _p: PhantomData,
+                }
+            } else {
+                GenericAlloc {
+                    alloc: self.alloc(size + padding, align)?,
+                    alloc_size: size,
+                    debug_offset: 0,
+                    padding,
+                    _p: PhantomData,
+                }
+            };
+
+        if debug_enabled(DebugFlags::FillAllocations) {
+            if let Some(p) = ret.ptr() {
+                unsafe { (p.as_ptr() as *mut u8).write_bytes(0xaa, ret.size()) };
+            }
+        }
+
+        if padding != 0 {
+            if let Some(p) = ret.ptr() {
+                unsafe {
+                    (p.as_ptr() as *mut u8)
+                        .add(ret.size())
+                        .write_bytes(GUARD_MARKER, padding);
+                }
+            }
+        }
+
+        Ok(ret)
+    }
+
+    /// Allocate an object of a given type, without actually initializing the allocation.
+    ///
+    /// This is useful to directly call [`GpuObject::new_*`], without borrowing a reference to the
+    /// allocator for the entire duration (e.g. if further allocations need to happen inside the
+    /// callbacks).
+    fn alloc_object<T: GpuStruct>(&mut self) -> Result<GenericAlloc<T, Self::Raw>> {
+        let size = mem::size_of::<T::Raw<'static>>();
+        let align = mem::align_of::<T::Raw<'static>>();
+
+        self.alloc_generic(size, align)
+    }
+
+    /// Allocate an empty `GpuArray` of a given type and length.
+    fn array_empty<T: Sized + Default>(
+        &mut self,
+        count: usize,
+    ) -> Result<GpuArray<T, GenericAlloc<T, Self::Raw>>> {
+        let size = mem::size_of::<T>() * count;
+        let align = mem::align_of::<T>();
+
+        let alloc = self.alloc_generic(size, align)?;
+        GpuArray::<T, GenericAlloc<T, Self::Raw>>::empty(alloc, count)
+    }
+
+    /// Allocate an empty `GpuOnlyArray` of a given type and length.
+    fn array_gpuonly<T: Sized + Default>(
+        &mut self,
+        count: usize,
+    ) -> Result<GpuOnlyArray<T, GenericAlloc<T, Self::Raw>>> {
+        let size = mem::size_of::<T>() * count;
+        let align = mem::align_of::<T>();
+
+        let alloc = self.alloc_generic(size, align)?;
+        GpuOnlyArray::<T, GenericAlloc<T, Self::Raw>>::new(alloc, count)
+    }
+}
+
+/// A simple allocation backed by a separate GEM object.
+///
+/// # Invariants
+/// `ptr` is either None or a valid, non-null pointer to the CPU view of the object.
+/// `gpu_ptr` is the GPU-side VA of the object.
+pub(crate) struct SimpleAllocation {
+    dev: AsahiDevice,
+    ptr: Option<NonNull<u8>>,
+    gpu_ptr: u64,
+    size: usize,
+    vm: mmu::Vm,
+    obj: crate::gem::ObjectRef,
+}
+
+/// SAFETY: `SimpleAllocation` just points to raw memory and should be safe to send across threads.
+unsafe impl Send for SimpleAllocation {}
+unsafe impl Sync for SimpleAllocation {}
+
+impl Drop for SimpleAllocation {
+    fn drop(&mut self) {
+        mod_dev_dbg!(
+            self.device(),
+            "SimpleAllocator: drop object @ {:#x}\n",
+            self.gpu_ptr()
+        );
+        if debug_enabled(DebugFlags::FillAllocations) {
+            if let Ok(vmap) = self.obj.vmap() {
+                vmap.as_mut_slice().fill(0x42);
+            }
+        }
+        self.obj.drop_vm_mappings(self.vm.id());
+    }
+}
+
+impl RawAllocation for SimpleAllocation {
+    fn ptr(&self) -> Option<NonNull<u8>> {
+        self.ptr
+    }
+    fn gpu_ptr(&self) -> u64 {
+        self.gpu_ptr
+    }
+    fn size(&self) -> usize {
+        self.size
+    }
+
+    fn device(&self) -> &AsahiDevice {
+        &self.dev
+    }
+}
+
+/// A simple allocator that allocates each object as its own GEM object, aligned to the end of a
+/// page.
+///
+/// This is very slow, but it has the advantage that over-reads by the firmware or GPU will fault on
+/// the guard page after the allocation, which can be useful to validate that the firmware's or
+/// GPU's idea of object size what we expect.
+pub(crate) struct SimpleAllocator {
+    dev: AsahiDevice,
+    start: u64,
+    end: u64,
+    prot: u32,
+    vm: mmu::Vm,
+    min_align: usize,
+    cpu_maps: bool,
+}
+
+impl SimpleAllocator {
+    /// Create a new `SimpleAllocator` for a given address range and `Vm`.
+    #[allow(dead_code)]
+    #[allow(clippy::too_many_arguments)]
+    pub(crate) fn new(
+        dev: &AsahiDevice,
+        vm: &mmu::Vm,
+        start: u64,
+        end: u64,
+        min_align: usize,
+        prot: u32,
+        _block_size: usize,
+        mut cpu_maps: bool,
+        _name: fmt::Arguments<'_>,
+        _keep_garbage: bool,
+    ) -> Result<SimpleAllocator> {
+        if debug_enabled(DebugFlags::ForceCPUMaps) {
+            cpu_maps = true;
+        }
+        Ok(SimpleAllocator {
+            dev: dev.clone(),
+            vm: vm.clone(),
+            start,
+            end,
+            prot,
+            min_align,
+            cpu_maps,
+        })
+    }
+}
+
+impl Allocator for SimpleAllocator {
+    type Raw = SimpleAllocation;
+
+    fn device(&self) -> &AsahiDevice {
+        &self.dev
+    }
+
+    fn cpu_maps(&self) -> bool {
+        self.cpu_maps
+    }
+
+    fn min_align(&self) -> usize {
+        self.min_align
+    }
+
+    #[inline(never)]
+    fn alloc(&mut self, size: usize, align: usize) -> Result<SimpleAllocation> {
+        let size_aligned = (size + mmu::UAT_PGSZ - 1) & !mmu::UAT_PGMSK;
+        let align = self.min_align.max(align);
+        let offset = (size_aligned - size) & !(align - 1);
+
+        mod_dev_dbg!(
+            &self.dev,
+            "SimpleAllocator::new: size={:#x} size_al={:#x} al={:#x} off={:#x}\n",
+            size,
+            size_aligned,
+            align,
+            offset
+        );
+
+        let mut obj = crate::gem::new_kernel_object(&self.dev, size_aligned)?;
+        let p = obj.vmap()?.as_mut_ptr() as *mut u8;
+        if debug_enabled(DebugFlags::FillAllocations) {
+            obj.vmap()?.as_mut_slice().fill(0xde);
+        }
+        let iova = obj.map_into_range(
+            &self.vm,
+            self.start,
+            self.end,
+            self.min_align.max(mmu::UAT_PGSZ) as u64,
+            self.prot,
+            true,
+        )?;
+
+        let ptr = unsafe { p.add(offset) } as *mut u8;
+        let gpu_ptr = (iova + offset) as u64;
+
+        mod_dev_dbg!(
+            &self.dev,
+            "SimpleAllocator::new -> {:#?} / {:#?} | {:#x} / {:#x}\n",
+            p,
+            ptr,
+            iova,
+            gpu_ptr
+        );
+
+        Ok(SimpleAllocation {
+            dev: self.dev.clone(),
+            ptr: NonNull::new(ptr),
+            gpu_ptr,
+            size,
+            vm: self.vm.clone(),
+            obj,
+        })
+    }
+}
+
+/// Inner data for an allocation from the heap allocator.
+///
+/// This is wrapped in an `mm::Node`.
+pub(crate) struct HeapAllocationInner {
+    dev: AsahiDevice,
+    ptr: Option<NonNull<u8>>,
+    real_size: usize,
+}
+
+/// SAFETY: `SimpleAllocation` just points to raw memory and should be safe to send across threads.
+unsafe impl Send for HeapAllocationInner {}
+unsafe impl Sync for HeapAllocationInner {}
+
+/// Outer view of a heap allocation.
+///
+/// This uses an Option<> so we can move the internal `Node` into the garbage pool when it gets
+/// dropped.
+///
+/// # Invariants
+/// The `Option` must always be `Some(...)` while this object is alive.
+pub(crate) struct HeapAllocation(Option<mm::Node<HeapAllocatorInner, HeapAllocationInner>>);
+
+impl Drop for HeapAllocation {
+    fn drop(&mut self) {
+        let node = self.0.take().unwrap();
+        let size = node.size();
+        let alloc = node.alloc_ref();
+
+        alloc.with(|a| {
+            if let Some(garbage) = a.garbage.as_mut() {
+                if garbage.try_push(node).is_err() {
+                    dev_err!(
+                        &a.dev,
+                        "HeapAllocation[{}]::drop: Failed to keep garbage\n",
+                        &*a.name,
+                    );
+                }
+                a.total_garbage += size as usize;
+                None
+            } else {
+                // We need to ensure node survives this scope, since dropping it
+                // will try to take the mm lock and deadlock us
+                Some(node)
+            }
+        });
+    }
+}
+
+impl mm::AllocInner<HeapAllocationInner> for HeapAllocatorInner {
+    fn drop_object(
+        &mut self,
+        start: u64,
+        _size: u64,
+        _color: usize,
+        obj: &mut HeapAllocationInner,
+    ) {
+        /* real_size == 0 means it's a guard node */
+        if obj.real_size > 0 {
+            mod_dev_dbg!(
+                obj.dev,
+                "HeapAllocator[{}]: drop object @ {:#x} ({} bytes)\n",
+                &*self.name,
+                start,
+                obj.real_size,
+            );
+            self.allocated -= obj.real_size;
+        }
+    }
+}
+
+impl RawAllocation for HeapAllocation {
+    // SAFETY: This function must always return a valid pointer.
+    // Since the HeapAllocation contains a reference to the
+    // backing_objects array that contains the object backing this pointer,
+    // and objects are only ever added to it, this pointer is guaranteed to
+    // remain valid for the lifetime of the HeapAllocation.
+    fn ptr(&self) -> Option<NonNull<u8>> {
+        self.0.as_ref().unwrap().ptr
+    }
+    // SAFETY: This function must always return a valid GPU pointer.
+    // See the explanation in ptr().
+    fn gpu_ptr(&self) -> u64 {
+        self.0.as_ref().unwrap().start()
+    }
+    fn size(&self) -> usize {
+        self.0.as_ref().unwrap().size() as usize
+    }
+    fn device(&self) -> &AsahiDevice {
+        &self.0.as_ref().unwrap().dev
+    }
+}
+
+/// Inner data for a heap allocator which uses the DRM MM range allocator to manage the heap.
+///
+/// This is wrapped by an `mm::Allocator`.
+struct HeapAllocatorInner {
+    dev: AsahiDevice,
+    allocated: usize,
+    backing_objects: Vec<(crate::gem::ObjectRef, u64)>,
+    garbage: Option<Vec<mm::Node<HeapAllocatorInner, HeapAllocationInner>>>,
+    total_garbage: usize,
+    name: CString,
+    vm_id: u64,
+}
+
+/// A heap allocator which uses the DRM MM range allocator to manage its objects.
+///
+/// The heap is composed of a series of GEM objects. This implementation only ever grows the heap,
+/// never shrinks it.
+pub(crate) struct HeapAllocator {
+    dev: AsahiDevice,
+    start: u64,
+    end: u64,
+    top: u64,
+    prot: u32,
+    vm: mmu::Vm,
+    min_align: usize,
+    block_size: usize,
+    cpu_maps: bool,
+    guard_nodes: Vec<mm::Node<HeapAllocatorInner, HeapAllocationInner>>,
+    mm: mm::Allocator<HeapAllocatorInner, HeapAllocationInner>,
+    name: CString,
+}
+
+static LOCK_KEY: LockClassKey = LockClassKey::new();
+
+impl HeapAllocator {
+    /// Create a new HeapAllocator for a given `Vm` and address range.
+    #[allow(dead_code)]
+    #[allow(clippy::too_many_arguments)]
+    pub(crate) fn new(
+        dev: &AsahiDevice,
+        vm: &mmu::Vm,
+        start: u64,
+        end: u64,
+        min_align: usize,
+        prot: u32,
+        block_size: usize,
+        mut cpu_maps: bool,
+        name: fmt::Arguments<'_>,
+        keep_garbage: bool,
+    ) -> Result<HeapAllocator> {
+        if !min_align.is_power_of_two() {
+            return Err(EINVAL);
+        }
+        if debug_enabled(DebugFlags::ForceCPUMaps) {
+            cpu_maps = true;
+        }
+
+        let name = CString::try_from_fmt(name)?;
+
+        let inner = HeapAllocatorInner {
+            dev: dev.clone(),
+            allocated: 0,
+            backing_objects: Vec::new(),
+            // TODO: This clearly needs a try_clone() or similar
+            name: CString::try_from_fmt(fmt!("{}", &*name))?,
+            vm_id: vm.id(),
+            garbage: if keep_garbage { Some(Vec::new()) } else { None },
+            total_garbage: 0,
+        };
+
+        let mm = mm::Allocator::new(
+            start,
+            end - start + 1,
+            inner,
+            c_str!("HeapAllocator"),
+            &LOCK_KEY,
+        )?;
+
+        Ok(HeapAllocator {
+            dev: dev.clone(),
+            vm: vm.clone(),
+            start,
+            end,
+            top: start,
+            prot,
+            min_align,
+            block_size: block_size.max(min_align),
+            cpu_maps,
+            guard_nodes: Vec::new(),
+            mm,
+            name,
+        })
+    }
+
+    /// Add a new backing block of the given size to this heap.
+    ///
+    /// If CPU mapping is enabled, this also adds a guard node to the range allocator to ensure that
+    /// objects cannot straddle backing block boundaries, since we cannot easily create a contiguous
+    /// CPU VA mapping for them. This can create some fragmentation. If CPU mapping is disabled, we
+    /// skip the guard blocks, since the GPU view of the heap is always contiguous.
+    fn add_block(&mut self, size: usize) -> Result {
+        let size_aligned = (size + mmu::UAT_PGSZ - 1) & !mmu::UAT_PGMSK;
+
+        mod_dev_dbg!(
+            &self.dev,
+            "HeapAllocator[{}]::add_block: size={:#x} size_al={:#x}\n",
+            &*self.name,
+            size,
+            size_aligned,
+        );
+
+        if self.top.saturating_add(size_aligned as u64) >= self.end {
+            dev_err!(
+                &self.dev,
+                "HeapAllocator[{}]::add_block: Exhausted VA space\n",
+                &*self.name,
+            );
+        }
+
+        let mut obj = crate::gem::new_kernel_object(&self.dev, size_aligned)?;
+        if self.cpu_maps && debug_enabled(DebugFlags::FillAllocations) {
+            obj.vmap()?.as_mut_slice().fill(0xde);
+        }
+
+        let gpu_ptr = self.top;
+        if let Err(e) = obj.map_at(&self.vm, gpu_ptr, self.prot, self.cpu_maps) {
+            dev_err!(
+                &self.dev,
+                "HeapAllocator[{}]::add_block: Failed to map at {:#x} ({:?})\n",
+                &*self.name,
+                gpu_ptr,
+                e
+            );
+            return Err(e);
+        }
+
+        self.mm
+            .with_inner(|inner| inner.backing_objects.try_reserve(1))?;
+
+        let mut new_top = self.top + size_aligned as u64;
+        if self.cpu_maps {
+            let guard = self.min_align.max(mmu::UAT_PGSZ);
+            mod_dev_dbg!(
+                &self.dev,
+                "HeapAllocator[{}]::add_block: Adding guard node {:#x}:{:#x}\n",
+                &*self.name,
+                new_top,
+                guard
+            );
+
+            let inner = HeapAllocationInner {
+                dev: self.dev.clone(),
+                ptr: None,
+                real_size: 0,
+            };
+
+            let node = match self.mm.reserve_node(inner, new_top, guard as u64, 0) {
+                Ok(a) => a,
+                Err(a) => {
+                    dev_err!(
+                        &self.dev,
+                        "HeapAllocator[{}]::add_block: Failed to reserve guard node {:#x}:{:#x}: {:?}\n",
+                        &*self.name,
+                        guard,
+                        new_top,
+                        a
+                    );
+                    return Err(EIO);
+                }
+            };
+
+            self.guard_nodes.try_push(node)?;
+
+            new_top += guard as u64;
+        }
+        mod_dev_dbg!(
+            &self.dev,
+            "HeapAllocator[{}]::add_block: top={:#x}\n",
+            &*self.name,
+            new_top
+        );
+
+        self.mm
+            .with_inner(|inner| inner.backing_objects.try_push((obj, gpu_ptr)))?;
+
+        self.top = new_top;
+
+        cls_dev_dbg!(
+            MemStats,
+            &self.dev,
+            "{} Heap: grow to {} bytes\n",
+            &*self.name,
+            self.top - self.start
+        );
+
+        Ok(())
+    }
+
+    /// Find the backing object index that backs a given GPU address.
+    fn find_obj(&mut self, addr: u64) -> Result<usize> {
+        self.mm.with_inner(|inner| {
+            inner
+                .backing_objects
+                .binary_search_by(|obj| {
+                    let start = obj.1;
+                    let end = obj.1 + obj.0.size() as u64;
+                    if start > addr {
+                        Ordering::Greater
+                    } else if end <= addr {
+                        Ordering::Less
+                    } else {
+                        Ordering::Equal
+                    }
+                })
+                .or(Err(ENOENT))
+        })
+    }
+}
+
+impl Allocator for HeapAllocator {
+    type Raw = HeapAllocation;
+
+    fn device(&self) -> &AsahiDevice {
+        &self.dev
+    }
+
+    fn cpu_maps(&self) -> bool {
+        self.cpu_maps
+    }
+
+    fn min_align(&self) -> usize {
+        self.min_align
+    }
+
+    fn alloc(&mut self, size: usize, align: usize) -> Result<HeapAllocation> {
+        if align != 0 && !align.is_power_of_two() {
+            return Err(EINVAL);
+        }
+        let align = self.min_align.max(align);
+        let size_aligned = (size + align - 1) & !(align - 1);
+
+        mod_dev_dbg!(
+            &self.dev,
+            "HeapAllocator[{}]::new: size={:#x} size_al={:#x}\n",
+            &*self.name,
+            size,
+            size_aligned,
+        );
+
+        let inner = HeapAllocationInner {
+            dev: self.dev.clone(),
+            ptr: None,
+            real_size: size,
+        };
+
+        let mut node = match self.mm.insert_node_generic(
+            inner,
+            size_aligned as u64,
+            align as u64,
+            0,
+            mm::InsertMode::Best,
+        ) {
+            Ok(a) => a,
+            Err(a) => {
+                dev_err!(
+                    &self.dev,
+                    "HeapAllocator[{}]::new: Failed to insert node of size {:#x} / align {:#x}: {:?}\n",
+                    &*self.name, size_aligned, align, a
+                );
+                return Err(a);
+            }
+        };
+
+        self.mm.with_inner(|inner| inner.allocated += size);
+
+        let mut new_object = false;
+        let start = node.start();
+        let end = start + node.size();
+        if end > self.top {
+            if start > self.top {
+                dev_warn!(
+                    self.dev,
+                    "HeapAllocator[{}]::alloc: top={:#x}, start={:#x}\n",
+                    &*self.name,
+                    self.top,
+                    start
+                );
+            }
+            let block_size = self.block_size.max((end - self.top) as usize);
+            self.add_block(block_size)?;
+            new_object = true;
+        }
+        assert!(end <= self.top);
+
+        if self.cpu_maps {
+            mod_dev_dbg!(
+                self.dev,
+                "HeapAllocator[{}]::alloc: mapping to CPU\n",
+                &*self.name
+            );
+
+            let idx = if new_object {
+                None
+            } else {
+                Some(match self.find_obj(start) {
+                    Ok(a) => a,
+                    Err(_) => {
+                        dev_warn!(
+                            self.dev,
+                            "HeapAllocator[{}]::alloc: Failed to find object at {:#x}\n",
+                            &*self.name,
+                            start
+                        );
+                        return Err(EIO);
+                    }
+                })
+            };
+            let (obj_start, obj_size, p) = self.mm.with_inner(|inner| -> Result<_> {
+                let idx = idx.unwrap_or(inner.backing_objects.len() - 1);
+                let obj = &mut inner.backing_objects[idx];
+                let p = obj.0.vmap()?.as_mut_ptr() as *mut u8;
+                Ok((obj.1, obj.0.size(), p))
+            })?;
+            assert!(obj_start <= start);
+            assert!(obj_start + obj_size as u64 >= end);
+            node.as_mut().inner_mut().ptr =
+                NonNull::new(unsafe { p.add((start - obj_start) as usize) });
+            mod_dev_dbg!(
+                self.dev,
+                "HeapAllocator[{}]::alloc: CPU pointer = {:?}\n",
+                &*self.name,
+                node.ptr
+            );
+        }
+
+        mod_dev_dbg!(
+            self.dev,
+            "HeapAllocator[{}]::alloc: Allocated {:#x} bytes @ {:#x}\n",
+            &*self.name,
+            end - start,
+            start
+        );
+
+        Ok(HeapAllocation(Some(node)))
+    }
+
+    fn garbage(&self) -> (usize, usize) {
+        self.mm.with_inner(|inner| {
+            if let Some(g) = inner.garbage.as_ref() {
+                (g.len(), inner.total_garbage)
+            } else {
+                (0, 0)
+            }
+        })
+    }
+
+    fn collect_garbage(&mut self, count: usize) {
+        // Take the garbage out of the inner block, so we can safely drop it without deadlocking
+        let mut garbage = Vec::new();
+
+        if garbage.try_reserve(count).is_err() {
+            dev_crit!(
+                self.dev,
+                "HeapAllocator[{}]:collect_garbage: failed to reserve space\n",
+                &*self.name,
+            );
+            return;
+        }
+
+        self.mm.with_inner(|inner| {
+            if let Some(g) = inner.garbage.as_mut() {
+                for node in g.drain(0..count) {
+                    inner.total_garbage -= node.size() as usize;
+                    garbage
+                        .try_push(node)
+                        .expect("try_push() failed after reserve()");
+                }
+            }
+        });
+    }
+}
+
+impl Drop for HeapAllocatorInner {
+    fn drop(&mut self) {
+        mod_dev_dbg!(
+            self.dev,
+            "HeapAllocator[{}]: dropping allocator\n",
+            &*self.name
+        );
+        if self.allocated > 0 {
+            // This should never happen
+            dev_crit!(
+                self.dev,
+                "HeapAllocator[{}]: dropping with {} bytes allocated\n",
+                &*self.name,
+                self.allocated
+            );
+        } else {
+            for mut obj in self.backing_objects.drain(..) {
+                obj.0.drop_vm_mappings(self.vm_id);
+            }
+        }
+    }
+}
diff --git a/drivers/gpu/drm/asahi/asahi.rs b/drivers/gpu/drm/asahi/asahi.rs
new file mode 100644
index 000000000000..e511d83f4cd1
--- /dev/null
+++ b/drivers/gpu/drm/asahi/asahi.rs
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+#![recursion_limit = "1024"]
+
+//! Driver for the Apple AGX GPUs found in Apple Silicon SoCs.
+
+mod alloc;
+mod buffer;
+mod channel;
+mod debug;
+mod driver;
+mod event;
+mod file;
+mod float;
+mod fw;
+mod gem;
+mod gpu;
+mod hw;
+mod initdata;
+mod mem;
+mod microseq;
+mod mmu;
+mod object;
+mod place;
+mod queue;
+mod regs;
+mod slotalloc;
+mod util;
+mod workqueue;
+
+use kernel::module_platform_driver;
+
+module_platform_driver! {
+    type: driver::AsahiDriver,
+    name: "asahi",
+    license: "Dual MIT/GPL",
+    params: {
+        debug_flags: u64 {
+            default: 0,
+            permissions: 0o644,
+            description: "Debug flags",
+        },
+        fault_control: u32 {
+            default: 0,
+            permissions: 0,
+            description: "Fault control (0x0: hard faults, 0xb: macOS default)",
+        },
+        initial_tvb_size: usize {
+            default: 0x8,
+            permissions: 0o644,
+            description: "Initial TVB size in blocks",
+        },
+    },
+}
diff --git a/drivers/gpu/drm/asahi/buffer.rs b/drivers/gpu/drm/asahi/buffer.rs
new file mode 100644
index 000000000000..767ea161176f
--- /dev/null
+++ b/drivers/gpu/drm/asahi/buffer.rs
@@ -0,0 +1,694 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+
+//! Tiled Vertex Buffer management
+//!
+//! This module manages the Tiled Vertex Buffer, also known as the Parameter Buffer (in imgtec
+//! parlance) or the tiler heap (on other architectures). This buffer holds transformed primitive
+//! data between the vertex/tiling stage and the fragment stage.
+//!
+//! On AGX, the buffer is a heap of 128K blocks split into 32K pages (which must be aligned to a
+//! multiple of 32K in VA space). The buffer can be shared between multiple render jobs, and each
+//! will allocate pages from it during vertex processing and return them during fragment processing.
+//!
+//! If the buffer runs out of free pages, the vertex pass stops and a partial fragment pass occurs,
+//! spilling the intermediate render target state to RAM (a partial render). This is all managed
+//! transparently by the firmware. Since partial renders are less efficient, the kernel must grow
+//! the heap in response to feedback from the firmware to avoid partial renders in the future.
+//! Currently, we only ever grow the heap, and never shrink it.
+//!
+//! AGX also supports memoryless render targets, which can be used for intermediate results within
+//! a render pass. To support partial renders, it seems the GPU/firmware has the ability to borrow
+//! pages from the TVB buffer as a temporary render target buffer. Since this happens during a
+//! partial render itself, if the buffer runs out of space, it requires synchronous growth in
+//! response to a firmware interrupt. This is not currently supported, but may be in the future,
+//! though it is unclear whether it is worth the effort.
+//!
+//! This module is also in charge of managing the temporary objects associated with a single render
+//! pass, which includes the top-level tile array, the tail pointer cache, preemption buffers, and
+//! other miscellaneous structures collectively managed as a "scene".
+//!
+//! To avoid runaway memory usage, there is a maximum size for buffers (at that point it's unlikely
+//! that partial renders will incur much overhead over the buffer data access itself). This is
+//! different depending on whether memoryless render targets are in use, and is currently hardcoded.
+//! to the most common value used by macOS.
+
+use crate::debug::*;
+use crate::fw::buffer;
+use crate::fw::types::*;
+use crate::util::*;
+use crate::{alloc, fw, gpu, mmu, slotalloc};
+use crate::{box_in_place, place};
+use core::sync::atomic::Ordering;
+use kernel::prelude::*;
+use kernel::sync::{smutex::Mutex, Arc};
+
+const DEBUG_CLASS: DebugFlags = DebugFlags::Buffer;
+
+/// There are 127 GPU/firmware-side buffer manager slots (yes, 127, not 128).
+const NUM_BUFFERS: u32 = 127;
+
+/// Page size bits for buffer pages (32K). VAs must be aligned to this size.
+pub(crate) const PAGE_SHIFT: usize = 15;
+/// Page size for buffer pages.
+pub(crate) const PAGE_SIZE: usize = 1 << PAGE_SHIFT;
+/// Number of pages in a buffer block, which should be contiguous in VA space.
+pub(crate) const PAGES_PER_BLOCK: usize = 4;
+/// Size of a buffer block.
+pub(crate) const BLOCK_SIZE: usize = PAGE_SIZE * PAGES_PER_BLOCK;
+
+/// Metadata about the tiling configuration for a scene. This is computed in the `render` module.
+/// based on dimensions, tile size, and other info.
+pub(crate) struct TileInfo {
+    /// Tile count in the X dimension. Tiles are always 32x32.
+    pub(crate) tiles_x: u32,
+    /// Tile count in the Y dimension. Tiles are always 32x32.
+    pub(crate) tiles_y: u32,
+    /// Total tile count.
+    pub(crate) tiles: u32,
+    /// Micro-tile width (16 or 32).
+    pub(crate) utile_width: u32,
+    /// Micro-tile height (16 or 32).
+    pub(crate) utile_height: u32,
+    // Macro-tiles in the X dimension. Always 4.
+    //pub(crate) mtiles_x: u32,
+    // Macro-tiles in the Y dimension. Always 4.
+    //pub(crate) mtiles_y: u32,
+    /// Tiles per macro-tile in the X dimension.
+    pub(crate) tiles_per_mtile_x: u32,
+    /// Tiles per macro-tile in the Y dimension.
+    pub(crate) tiles_per_mtile_y: u32,
+    // Total tiles per macro-tile.
+    //pub(crate) tiles_per_mtile: u32,
+    /// Micro-tiles per macro-tile in the X dimension.
+    pub(crate) utiles_per_mtile_x: u32,
+    /// Micro-tiles per macro-tile in the Y dimension.
+    pub(crate) utiles_per_mtile_y: u32,
+    // Total micro-tiles per macro-tile.
+    //pub(crate) utiles_per_mtile: u32,
+    /// Size of the top-level tilemap, in bytes (for all layers, one cluster).
+    pub(crate) tilemap_size: usize,
+    /// Size of the Tail Pointer Cache, in bytes (for all layers * clusters).
+    pub(crate) tpc_size: usize,
+    /// Number of blocks in the clustering meta buffer (for clustering).
+    pub(crate) meta1_blocks: u32,
+    /// Minimum number of TVB blocks for this render.
+    pub(crate) min_tvb_blocks: usize,
+    /// XXX: Allocation factor for cluster tilemaps and meta4. Always 2?
+    pub(crate) cluster_factor: usize,
+    /// Tiling parameter structure passed to firmware.
+    pub(crate) params: fw::vertex::raw::TilingParameters,
+}
+
+/// A single scene, representing a render pass and its required buffers.
+#[versions(AGX)]
+#[derive(Debug)]
+pub(crate) struct Scene {
+    object: GpuObject<buffer::Scene::ver>,
+    slot: u32,
+    rebind: bool,
+    preempt2_off: usize,
+    preempt3_off: usize,
+    // Note: these are dead code only on some version variants.
+    // It's easier to do this than to propagate the version conditionals everywhere.
+    #[allow(dead_code)]
+    meta2_off: usize,
+    #[allow(dead_code)]
+    meta3_off: usize,
+    #[allow(dead_code)]
+    meta4_off: usize,
+}
+
+#[versions(AGX)]
+impl Scene::ver {
+    /// Returns true if the buffer was bound to a fresh manager slot, and therefore needs an init
+    /// command before a render.
+    pub(crate) fn rebind(&self) -> bool {
+        self.rebind
+    }
+
+    /// Returns the buffer manager slot this scene's buffer was bound to.
+    pub(crate) fn slot(&self) -> u32 {
+        self.slot
+    }
+
+    /// Returns the GPU pointer to the [`buffer::Scene::ver`].
+    pub(crate) fn gpu_pointer(&self) -> GpuPointer<'_, buffer::Scene::ver> {
+        self.object.gpu_pointer()
+    }
+
+    /// Returns the GPU weak pointer to the [`buffer::Scene::ver`].
+    pub(crate) fn weak_pointer(&self) -> GpuWeakPointer<buffer::Scene::ver> {
+        self.object.weak_pointer()
+    }
+
+    /// Returns the GPU weak pointer to the kernel-side temp buffer.
+    /// (purpose unknown...)
+    pub(crate) fn kernel_buffer_pointer(&self) -> GpuWeakPointer<[u8]> {
+        self.object.buffer.inner.lock().kernel_buffer.weak_pointer()
+    }
+
+    /// Returns the GPU pointer to the `buffer::Info::ver` object associated with this Scene.
+    pub(crate) fn buffer_pointer(&self) -> GpuPointer<'_, buffer::Info::ver> {
+        // We can't return the strong pointer directly since its lifetime crosses a lock, but we know
+        // its lifetime will be valid as long as &self since we hold a reference to the buffer,
+        // so just construct the strong pointer with the right lifetime here.
+        unsafe { self.weak_buffer_pointer().upgrade() }
+    }
+
+    /// Returns the GPU weak pointer to the `buffer::Info::ver` object associated with this Scene.
+    pub(crate) fn weak_buffer_pointer(&self) -> GpuWeakPointer<buffer::Info::ver> {
+        self.object.buffer.inner.lock().info.weak_pointer()
+    }
+
+    /// Returns the GPU pointer to the TVB heap metadata buffer.
+    pub(crate) fn tvb_heapmeta_pointer(&self) -> GpuPointer<'_, &'_ [u8]> {
+        self.object.tvb_heapmeta.gpu_pointer()
+    }
+
+    /// Returns the GPU pointer to the top-level TVB tilemap buffer.
+    pub(crate) fn tvb_tilemap_pointer(&self) -> GpuPointer<'_, &'_ [u8]> {
+        self.object.tvb_tilemap.gpu_pointer()
+    }
+
+    /// Returns the GPU pointer to the Tail Pointer Cache buffer.
+    pub(crate) fn tpc_pointer(&self) -> GpuPointer<'_, &'_ [u8]> {
+        self.object.tpc.gpu_pointer()
+    }
+
+    /// Returns the GPU pointer to the first preemption scratch buffer.
+    pub(crate) fn preempt_buf_1_pointer(&self) -> GpuPointer<'_, &'_ [u8]> {
+        self.object.preempt_buf.gpu_pointer()
+    }
+
+    /// Returns the GPU pointer to the second preemption scratch buffer.
+    pub(crate) fn preempt_buf_2_pointer(&self) -> GpuPointer<'_, &'_ [u8]> {
+        self.object
+            .preempt_buf
+            .gpu_offset_pointer(self.preempt2_off)
+    }
+
+    /// Returns the GPU pointer to the third preemption scratch buffer.
+    pub(crate) fn preempt_buf_3_pointer(&self) -> GpuPointer<'_, &'_ [u8]> {
+        self.object
+            .preempt_buf
+            .gpu_offset_pointer(self.preempt3_off)
+    }
+
+    /// Returns the GPU pointer to the per-cluster tilemap buffer, if clustering is enabled.
+    #[allow(dead_code)]
+    pub(crate) fn cluster_tilemaps_pointer(&self) -> Option<GpuPointer<'_, &'_ [u8]>> {
+        self.object
+            .clustering
+            .as_ref()
+            .map(|c| c.tilemaps.gpu_pointer())
+    }
+
+    /// Returns the GPU pointer to the clustering metadata 1 buffer, if clustering is enabled.
+    #[allow(dead_code)]
+    pub(crate) fn meta_1_pointer(&self) -> Option<GpuPointer<'_, &'_ [u8]>> {
+        self.object
+            .clustering
+            .as_ref()
+            .map(|c| c.meta.gpu_pointer())
+    }
+
+    /// Returns the GPU pointer to the clustering metadata 2 buffer, if clustering is enabled.
+    #[allow(dead_code)]
+    pub(crate) fn meta_2_pointer(&self) -> Option<GpuPointer<'_, &'_ [u8]>> {
+        self.object
+            .clustering
+            .as_ref()
+            .map(|c| c.meta.gpu_offset_pointer(self.meta2_off))
+    }
+
+    /// Returns the GPU pointer to the clustering metadata 3 buffer, if clustering is enabled.
+    #[allow(dead_code)]
+    pub(crate) fn meta_3_pointer(&self) -> Option<GpuPointer<'_, &'_ [u8]>> {
+        self.object
+            .clustering
+            .as_ref()
+            .map(|c| c.meta.gpu_offset_pointer(self.meta3_off))
+    }
+
+    /// Returns the GPU pointer to the clustering metadata 4 buffer, if clustering is enabled.
+    #[allow(dead_code)]
+    pub(crate) fn meta_4_pointer(&self) -> Option<GpuPointer<'_, &'_ [u8]>> {
+        self.object
+            .clustering
+            .as_ref()
+            .map(|c| c.meta.gpu_offset_pointer(self.meta4_off))
+    }
+
+    /// Returns the GPU pointer to an unknown buffer with incrementing numbers.
+    pub(crate) fn seq_buf_pointer(&self) -> GpuPointer<'_, &'_ [u64]> {
+        self.object.seq_buf.gpu_pointer()
+    }
+
+    /// Returns the number of TVB bytes used for this scene.
+    pub(crate) fn used_bytes(&self) -> usize {
+        self.object
+            .with(|raw, _inner| raw.total_page_count.load(Ordering::Relaxed) as usize * PAGE_SIZE)
+    }
+
+    /// Returns whether the TVB overflowed while rendering this scene.
+    pub(crate) fn overflowed(&self) -> bool {
+        self.object.with(|raw, _inner| {
+            raw.total_page_count.load(Ordering::Relaxed)
+                > raw.pass_page_count.load(Ordering::Relaxed)
+        })
+    }
+}
+
+#[versions(AGX)]
+impl Drop for Scene::ver {
+    fn drop(&mut self) {
+        let mut inner = self.object.buffer.inner.lock();
+        assert_ne!(inner.active_scenes, 0);
+        inner.active_scenes -= 1;
+
+        if inner.active_scenes == 0 {
+            mod_pr_debug!(
+                "Buffer: no scenes left, dropping slot {}",
+                inner.active_slot.take().unwrap().slot()
+            );
+            inner.active_slot = None;
+        }
+    }
+}
+
+/// Inner data for a single TVB buffer object.
+#[versions(AGX)]
+struct BufferInner {
+    info: GpuObject<buffer::Info::ver>,
+    ualloc: Arc<Mutex<alloc::DefaultAllocator>>,
+    ualloc_priv: Arc<Mutex<alloc::DefaultAllocator>>,
+    blocks: Vec<GpuOnlyArray<u8>>,
+    max_blocks: usize,
+    max_blocks_nomemless: usize,
+    mgr: BufferManager,
+    active_scenes: usize,
+    active_slot: Option<slotalloc::Guard<()>>,
+    last_token: Option<slotalloc::SlotToken>,
+    tpc: Option<Arc<GpuArray<u8>>>,
+    kernel_buffer: GpuArray<u8>,
+    stats: GpuObject<buffer::Stats>,
+    preempt1_size: usize,
+    preempt2_size: usize,
+    preempt3_size: usize,
+    num_clusters: usize,
+}
+
+/// Locked and reference counted TVB buffer.
+#[versions(AGX)]
+pub(crate) struct Buffer {
+    inner: Arc<Mutex<BufferInner::ver>>,
+}
+
+#[versions(AGX)]
+impl Buffer::ver {
+    /// Create a new Buffer for a given VM, given the per-VM allocators.
+    pub(crate) fn new(
+        gpu: &dyn gpu::GpuManager,
+        alloc: &mut gpu::KernelAllocators,
+        ualloc: Arc<Mutex<alloc::DefaultAllocator>>,
+        ualloc_priv: Arc<Mutex<alloc::DefaultAllocator>>,
+        mgr: &BufferManager,
+    ) -> Result<Buffer::ver> {
+        // These are the typical max numbers on macOS.
+        // 8GB machines have this halved.
+        let max_size: usize = 862_322_688; // bytes
+        let max_size_nomemless = max_size / 3;
+
+        let max_blocks = max_size / BLOCK_SIZE;
+        let max_blocks_nomemless = max_size_nomemless / BLOCK_SIZE;
+        let max_pages = max_blocks * PAGES_PER_BLOCK;
+        let max_pages_nomemless = max_blocks_nomemless * PAGES_PER_BLOCK;
+
+        let num_clusters = gpu.get_dyncfg().id.num_clusters as usize;
+        let num_clusters_adj = if num_clusters > 1 {
+            num_clusters + 1
+        } else {
+            1
+        };
+
+        let preempt1_size = num_clusters_adj * gpu.get_cfg().preempt1_size;
+        let preempt2_size = num_clusters_adj * gpu.get_cfg().preempt2_size;
+        let preempt3_size = num_clusters_adj * gpu.get_cfg().preempt3_size;
+
+        let inner = box_in_place!(buffer::Info::ver {
+            block_ctl: alloc.shared.new_default::<buffer::BlockControl>()?,
+            counter: alloc.shared.new_default::<buffer::Counter>()?,
+            page_list: ualloc_priv.lock().array_empty(max_pages)?,
+            block_list: ualloc_priv.lock().array_empty(max_blocks * 2)?,
+        })?;
+
+        let info = alloc.private.new_boxed(inner, |inner, ptr| {
+            Ok(place!(
+                ptr,
+                buffer::raw::Info::ver {
+                    gpu_counter: 0x0,
+                    unk_4: 0,
+                    last_id: 0x0,
+                    cur_id: -1,
+                    unk_10: 0x0,
+                    gpu_counter2: 0x0,
+                    unk_18: 0x0,
+                    #[ver(V < V13_0B4)]
+                    unk_1c: 0x0,
+                    page_list: inner.page_list.gpu_pointer(),
+                    page_list_size: (4 * max_pages).try_into()?,
+                    page_count: AtomicU32::new(0),
+                    max_blocks: max_blocks.try_into()?,
+                    block_count: AtomicU32::new(0),
+                    unk_38: 0x0,
+                    block_list: inner.block_list.gpu_pointer(),
+                    block_ctl: inner.block_ctl.gpu_pointer(),
+                    last_page: AtomicU32::new(0),
+                    gpu_page_ptr1: 0x0,
+                    gpu_page_ptr2: 0x0,
+                    unk_58: 0x0,
+                    block_size: BLOCK_SIZE as u32,
+                    unk_60: U64(0x0),
+                    counter: inner.counter.gpu_pointer(),
+                    unk_70: 0x0,
+                    unk_74: 0x0,
+                    unk_78: 0x0,
+                    unk_7c: 0x0,
+                    unk_80: 0x1,
+                    max_pages: max_pages.try_into()?,
+                    max_pages_nomemless: max_pages_nomemless.try_into()?,
+                    unk_8c: 0x0,
+                    unk_90: Default::default(),
+                }
+            ))
+        })?;
+
+        // Technically similar to Scene below, let's play it safe.
+        let kernel_buffer = alloc.shared.array_empty(0x40)?;
+        let stats = alloc
+            .shared
+            .new_object(Default::default(), |_inner| buffer::raw::Stats {
+                reset: AtomicU32::from(1),
+                ..Default::default()
+            })?;
+
+        Ok(Buffer::ver {
+            inner: Arc::try_new(Mutex::new(BufferInner::ver {
+                info,
+                ualloc,
+                ualloc_priv,
+                blocks: Vec::new(),
+                max_blocks,
+                max_blocks_nomemless,
+                mgr: mgr.clone(),
+                active_scenes: 0,
+                active_slot: None,
+                last_token: None,
+                tpc: None,
+                kernel_buffer,
+                stats,
+                preempt1_size,
+                preempt2_size,
+                preempt3_size,
+                num_clusters,
+            }))?,
+        })
+    }
+
+    /// Returns the total block count allocated to this Buffer.
+    pub(crate) fn block_count(&self) -> u32 {
+        self.inner.lock().blocks.len() as u32
+    }
+
+    /// Returns the total size in bytes allocated to this Buffer.
+    pub(crate) fn size(&self) -> usize {
+        self.block_count() as usize * BLOCK_SIZE
+    }
+
+    /// Automatically grow the Buffer based on feedback from the statistics.
+    pub(crate) fn auto_grow(&self) -> Result<bool> {
+        let inner = self.inner.lock();
+
+        let used_pages = inner.stats.with(|raw, _inner| {
+            let used = raw.max_pages.load(Ordering::Relaxed);
+            raw.reset.store(1, Ordering::Release);
+            used as usize
+        });
+
+        let need_blocks = div_ceil(used_pages * 2, PAGES_PER_BLOCK).min(inner.max_blocks_nomemless);
+        let want_blocks = div_ceil(used_pages * 3, PAGES_PER_BLOCK).min(inner.max_blocks_nomemless);
+
+        let cur_count = inner.blocks.len();
+
+        if need_blocks <= cur_count {
+            Ok(false)
+        } else {
+            // Grow to 3x requested size (same logic as macOS)
+            core::mem::drop(inner);
+            self.ensure_blocks(want_blocks)?;
+            Ok(true)
+        }
+    }
+
+    /// Ensure that the buffer has at least a certain minimum size in blocks.
+    pub(crate) fn ensure_blocks(&self, min_blocks: usize) -> Result<bool> {
+        let mut inner = self.inner.lock();
+
+        let cur_count = inner.blocks.len();
+        if cur_count >= min_blocks {
+            return Ok(false);
+        }
+        if min_blocks > inner.max_blocks {
+            return Err(ENOMEM);
+        }
+
+        let add_blocks = min_blocks - cur_count;
+        let new_count = min_blocks;
+
+        let mut new_blocks: Vec<GpuOnlyArray<u8>> = Vec::new();
+
+        // Allocate the new blocks first, so if it fails they will be dropped
+        let mut ualloc = inner.ualloc.lock();
+        for _i in 0..add_blocks {
+            new_blocks.try_push(ualloc.array_gpuonly(BLOCK_SIZE)?)?;
+        }
+        core::mem::drop(ualloc);
+
+        // Then actually commit them
+        inner.blocks.try_reserve(add_blocks)?;
+
+        for (i, block) in new_blocks.into_iter().enumerate() {
+            let page_num = (block.gpu_va().get() >> PAGE_SHIFT) as u32;
+
+            inner
+                .blocks
+                .try_push(block)
+                .expect("try_push() failed after try_reserve()");
+            inner.info.block_list[2 * (cur_count + i)] = page_num;
+            for j in 0..PAGES_PER_BLOCK {
+                inner.info.page_list[(cur_count + i) * PAGES_PER_BLOCK + j] = page_num + j as u32;
+            }
+        }
+
+        inner.info.block_ctl.with(|raw, _inner| {
+            raw.total.store(new_count as u32, Ordering::SeqCst);
+            raw.wptr.store(new_count as u32, Ordering::SeqCst);
+        });
+
+        let page_count = (new_count * PAGES_PER_BLOCK) as u32;
+        inner.info.with(|raw, _inner| {
+            raw.page_count.store(page_count, Ordering::Relaxed);
+            raw.block_count.store(new_count as u32, Ordering::Relaxed);
+            raw.last_page.store(page_count - 1, Ordering::Relaxed);
+        });
+
+        Ok(true)
+    }
+
+    /// Create a new [`Scene::ver`] (render pass) using this buffer.
+    pub(crate) fn new_scene(
+        &self,
+        alloc: &mut gpu::KernelAllocators,
+        tile_info: &TileInfo,
+    ) -> Result<Scene::ver> {
+        let mut inner = self.inner.lock();
+
+        let tilemap_size = tile_info.tilemap_size;
+        let tpc_size = tile_info.tpc_size;
+
+        // TODO: what is this exactly?
+        mod_pr_debug!("Buffer: Allocating TVB buffers\n");
+
+        // This seems to be a list, with 4x2 bytes of headers and 8 bytes per entry.
+        // On single-cluster devices, the used length always seems to be 1.
+        // On M1 Ultra, it can grow and usually doesn't exceed 8 * cluster_factor
+        // entries. macOS allocates a whole 64K * 0x80 for this, so let's go with
+        // that to be safe...
+        let user_buffer = inner.ualloc.lock().array_empty(if inner.num_clusters > 1 {
+            0x10080
+        } else {
+            0x80
+        })?;
+
+        let tvb_heapmeta = inner.ualloc.lock().array_empty(0x200)?;
+        let tvb_tilemap = inner.ualloc.lock().array_empty(tilemap_size)?;
+
+        mod_pr_debug!("Buffer: Allocating misc buffers\n");
+        let preempt_buf = inner
+            .ualloc
+            .lock()
+            .array_empty(inner.preempt1_size + inner.preempt2_size + inner.preempt3_size)?;
+
+        let mut seq_buf = inner.ualloc.lock().array_empty(0x800)?;
+        for i in 1..0x400 {
+            seq_buf[i] = (i + 1) as u64;
+        }
+
+        let tpc = match inner.tpc.as_ref() {
+            Some(buf) if buf.len() >= tpc_size => buf.clone(),
+            _ => {
+                // MacOS allocates this as shared GPU+FW, but
+                // priv seems to work and might be faster?
+                // Needs to be FW-writable anyway, so ualloc
+                // won't work.
+                let buf = Arc::try_new(
+                    inner
+                        .ualloc_priv
+                        .lock()
+                        .array_empty((tpc_size + mmu::UAT_PGMSK) & !mmu::UAT_PGMSK)?,
+                )?;
+                inner.tpc = Some(buf.clone());
+                buf
+            }
+        };
+
+        // Maybe: (4x4 macro tiles + 1 global page)*n, 32bit each (17*4*n)
+        let meta1_size = align(tile_info.meta1_blocks as usize * 0x44, 0x80);
+        // check
+        let meta2_size = align(0x190 * inner.num_clusters, 0x80);
+        let meta3_size = align(0x280 * inner.num_clusters, 0x80);
+        // Like user_buffer for single-cluster modes, 0x30 per cluster * the cluster
+        // factor.
+        let meta4_size = align(0x30 * inner.num_clusters * tile_info.cluster_factor, 0x80);
+        let meta_size = meta1_size + meta2_size + meta3_size + meta4_size;
+
+        let clustering = if inner.num_clusters > 1 {
+            mod_pr_debug!("Buffer: Allocating clustering buffers\n");
+            let tilemaps = inner
+                .ualloc
+                .lock()
+                .array_empty(inner.num_clusters * tilemap_size * tile_info.cluster_factor)?;
+            let meta = inner.ualloc.lock().array_empty(meta_size)?;
+            Some(buffer::ClusterBuffers { tilemaps, meta })
+        } else {
+            None
+        };
+
+        let scene_inner = box_in_place!(buffer::Scene::ver {
+            user_buffer: user_buffer,
+            buffer: self.clone(),
+            tvb_heapmeta: tvb_heapmeta,
+            tvb_tilemap: tvb_tilemap,
+            tpc: tpc,
+            clustering: clustering,
+            preempt_buf: preempt_buf,
+            seq_buf: seq_buf,
+        })?;
+
+        // Could be made strong, but we wind up with a deadlock if we try to grab the
+        // pointer through the inner.buffer path inside the closure.
+        let stats_pointer = inner.stats.weak_pointer();
+
+        // macOS allocates this as private. However, the firmware does not
+        // DC CIVAC this before reading it (like it does most other things),
+        // which causes odd cache incoherency bugs when combined with
+        // speculation on the firmware side (maybe). This doesn't happen
+        // on macOS because these structs are a circular pool that is mapped
+        // already initialized. Just mark this shared for now.
+        let scene = alloc.shared.new_boxed(scene_inner, |inner, ptr| {
+            Ok(place!(
+                ptr,
+                buffer::raw::Scene {
+                    pass_page_count: AtomicU32::new(0),
+                    unk_4: 0,
+                    unk_8: U64(0),
+                    unk_10: U64(0),
+                    user_buffer: inner.user_buffer.gpu_pointer(),
+                    unk_20: 0,
+                    stats: stats_pointer,
+                    total_page_count: AtomicU32::new(0),
+                    unk_30: U64(0),
+                    unk_38: U64(0),
+                }
+            ))
+        })?;
+
+        let mut rebind = false;
+
+        if inner.active_slot.is_none() {
+            assert_eq!(inner.active_scenes, 0);
+
+            let slot = inner.mgr.0.get(inner.last_token)?;
+            rebind = slot.changed();
+
+            mod_pr_debug!("Buffer: assigning slot {} (rebind={})", slot.slot(), rebind);
+
+            inner.last_token = Some(slot.token());
+            inner.active_slot = Some(slot);
+        }
+
+        inner.active_scenes += 1;
+
+        Ok(Scene::ver {
+            object: scene,
+            slot: inner.active_slot.as_ref().unwrap().slot(),
+            rebind,
+            preempt2_off: inner.preempt1_size,
+            preempt3_off: inner.preempt1_size + inner.preempt2_size,
+            meta2_off: meta1_size,
+            meta3_off: meta1_size + meta2_size,
+            meta4_off: meta1_size + meta2_size + meta3_size,
+        })
+    }
+
+    /// Increment the buffer manager usage count. Should we done once we know the Scene is ready
+    /// to be committed and used in commands submitted to the GPU.
+    pub(crate) fn increment(&self) {
+        let inner = self.inner.lock();
+        inner.info.counter.with(|raw, _inner| {
+            // We could use fetch_add, but the non-LSE atomic
+            // sequence Rust produces confuses the hypervisor.
+            // We have inner locked anyway, so this is not racy.
+            let v = raw.count.load(Ordering::Relaxed);
+            raw.count.store(v + 1, Ordering::Relaxed);
+        });
+    }
+}
+
+#[versions(AGX)]
+impl Clone for Buffer::ver {
+    fn clone(&self) -> Self {
+        Buffer::ver {
+            inner: self.inner.clone(),
+        }
+    }
+}
+
+/// The GPU-global buffer manager, used to allocate and release buffer slots from the pool.
+pub(crate) struct BufferManager(slotalloc::SlotAllocator<()>);
+
+impl BufferManager {
+    pub(crate) fn new() -> Result<BufferManager> {
+        Ok(BufferManager(slotalloc::SlotAllocator::new(
+            NUM_BUFFERS,
+            (),
+            |_inner, _slot| (),
+        )?))
+    }
+}
+
+impl Clone for BufferManager {
+    fn clone(&self) -> Self {
+        BufferManager(self.0.clone())
+    }
+}
diff --git a/drivers/gpu/drm/asahi/channel.rs b/drivers/gpu/drm/asahi/channel.rs
new file mode 100644
index 000000000000..0b3c3b65c279
--- /dev/null
+++ b/drivers/gpu/drm/asahi/channel.rs
@@ -0,0 +1,542 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+
+//! GPU ring buffer channels
+//!
+//! The GPU firmware use a set of ring buffer channels to receive commands from the driver and send
+//! it notifications and status messages.
+//!
+//! These ring buffers mostly follow uniform conventions, so they share the same base
+//! implementation.
+
+use crate::debug::*;
+use crate::driver::AsahiDevice;
+use crate::fw::channels::*;
+use crate::fw::initdata::{raw, ChannelRing};
+use crate::fw::types::*;
+use crate::{event, gpu, mem};
+use core::time::Duration;
+use kernel::{c_str, delay::coarse_sleep, prelude::*, sync::Arc, time};
+
+pub(crate) use crate::fw::channels::PipeType;
+
+/// A receive (FW->driver) channel.
+pub(crate) struct RxChannel<T: RxChannelState, U: Copy + Default>
+where
+    for<'a> <T as GpuStruct>::Raw<'a>: Debug + Default + Zeroed,
+{
+    ring: ChannelRing<T, U>,
+    // FIXME: needs feature(generic_const_exprs)
+    //rptr: [u32; T::SUB_CHANNELS],
+    rptr: [u32; 6],
+    count: u32,
+}
+
+impl<T: RxChannelState, U: Copy + Default> RxChannel<T, U>
+where
+    for<'a> <T as GpuStruct>::Raw<'a>: Debug + Default + Zeroed,
+{
+    /// Allocates a new receive channel with a given message count.
+    pub(crate) fn new(alloc: &mut gpu::KernelAllocators, count: usize) -> Result<RxChannel<T, U>> {
+        Ok(RxChannel {
+            ring: ChannelRing {
+                state: alloc.shared.new_default()?,
+                ring: alloc.shared.array_empty(T::SUB_CHANNELS * count)?,
+            },
+            rptr: Default::default(),
+            count: count as u32,
+        })
+    }
+
+    /// Receives a message on the specified sub-channel index, optionally leaving in the ring
+    /// buffer.
+    ///
+    /// Returns None if the channel is empty.
+    fn get_or_peek(&mut self, index: usize, peek: bool) -> Option<U> {
+        self.ring.state.with(|raw, _inner| {
+            let wptr = T::wptr(raw, index);
+            let rptr = &mut self.rptr[index];
+            if wptr == *rptr {
+                None
+            } else {
+                let off = self.count as usize * index;
+                let msg = self.ring.ring[off + *rptr as usize];
+                if !peek {
+                    *rptr = (*rptr + 1) % self.count;
+                    T::set_rptr(raw, index, *rptr);
+                }
+                Some(msg)
+            }
+        })
+    }
+
+    /// Receives a message on the specified sub-channel index, and dequeues it from the ring buffer.
+    ///
+    /// Returns None if the channel is empty.
+    pub(crate) fn get(&mut self, index: usize) -> Option<U> {
+        self.get_or_peek(index, false)
+    }
+
+    /// Peeks a message on the specified sub-channel index, leaving it in the ring buffer.
+    ///
+    /// Returns None if the channel is empty.
+    pub(crate) fn peek(&mut self, index: usize) -> Option<U> {
+        self.get_or_peek(index, true)
+    }
+}
+
+/// A transmit (driver->FW) channel.
+pub(crate) struct TxChannel<T: TxChannelState, U: Copy + Default>
+where
+    for<'a> <T as GpuStruct>::Raw<'a>: Debug + Default + Zeroed,
+{
+    ring: ChannelRing<T, U>,
+    wptr: u32,
+    count: u32,
+}
+
+impl<T: TxChannelState, U: Copy + Default> TxChannel<T, U>
+where
+    for<'a> <T as GpuStruct>::Raw<'a>: Debug + Default + Zeroed,
+{
+    /// Allocates a new cached transmit channel with a given message count.
+    pub(crate) fn new(alloc: &mut gpu::KernelAllocators, count: usize) -> Result<TxChannel<T, U>> {
+        Ok(TxChannel {
+            ring: ChannelRing {
+                state: alloc.shared.new_default()?,
+                ring: alloc.private.array_empty(count)?,
+            },
+            wptr: 0,
+            count: count as u32,
+        })
+    }
+
+    /// Allocates a new uncached transmit channel with a given message count.
+    pub(crate) fn new_uncached(
+        alloc: &mut gpu::KernelAllocators,
+        count: usize,
+    ) -> Result<TxChannel<T, U>> {
+        Ok(TxChannel {
+            ring: ChannelRing {
+                state: alloc.shared.new_default()?,
+                ring: alloc.shared.array_empty(count)?,
+            },
+            wptr: 0,
+            count: count as u32,
+        })
+    }
+
+    /// Send a message to the ring, returning a cookie with the ring buffer position.
+    ///
+    /// This will poll/block if the ring is full, which we don't really expect to happen.
+    pub(crate) fn put(&mut self, msg: &U) -> u32 {
+        self.ring.state.with(|raw, _inner| {
+            let next_wptr = (self.wptr + 1) % self.count;
+            let mut rptr = T::rptr(raw);
+            if next_wptr == rptr {
+                pr_err!(
+                    "TX ring buffer is full! Waiting... ({}, {})\n",
+                    next_wptr,
+                    rptr
+                );
+                // TODO: block properly on incoming messages?
+                while next_wptr == rptr {
+                    coarse_sleep(Duration::from_millis(8));
+                    rptr = T::rptr(raw);
+                }
+            }
+            self.ring.ring[self.wptr as usize] = *msg;
+            mem::sync();
+            T::set_wptr(raw, next_wptr);
+            self.wptr = next_wptr;
+        });
+        self.wptr
+    }
+
+    /// Wait for a previously submitted message to be popped off of the ring by the GPU firmware.
+    ///
+    /// This busy-loops, and is intended to be used for rare cases when we need to block for
+    /// completion of a cache management or invalidation operation synchronously (which
+    /// the firmware normally completes fast enough not to be worth sleeping for).
+    /// If the poll takes longer than 10ms, this switches to sleeping between polls.
+    pub(crate) fn wait_for(&mut self, wptr: u32, timeout_ms: u64) -> Result {
+        const MAX_FAST_POLL: u64 = 10;
+        let start = time::ktime_get();
+        let timeout_fast = start + Duration::from_millis(timeout_ms.min(MAX_FAST_POLL));
+        let timeout_slow = start + Duration::from_millis(timeout_ms);
+        self.ring.state.with(|raw, _inner| {
+            while time::ktime_get() < timeout_fast {
+                if T::rptr(raw) == wptr {
+                    return Ok(());
+                }
+                mem::sync();
+            }
+            while time::ktime_get() < timeout_slow {
+                if T::rptr(raw) == wptr {
+                    return Ok(());
+                }
+                coarse_sleep(Duration::from_millis(5));
+                mem::sync();
+            }
+            Err(ETIMEDOUT)
+        })
+    }
+}
+
+/// Device Control channel for global device management commands.
+#[versions(AGX)]
+pub(crate) struct DeviceControlChannel {
+    dev: AsahiDevice,
+    ch: TxChannel<ChannelState, DeviceControlMsg::ver>,
+}
+
+#[versions(AGX)]
+impl DeviceControlChannel::ver {
+    const COMMAND_TIMEOUT_MS: u64 = 1000;
+
+    /// Allocate a new Device Control channel.
+    pub(crate) fn new(
+        dev: &AsahiDevice,
+        alloc: &mut gpu::KernelAllocators,
+    ) -> Result<DeviceControlChannel::ver> {
+        Ok(DeviceControlChannel::ver {
+            dev: dev.clone(),
+            ch: TxChannel::<ChannelState, DeviceControlMsg::ver>::new(alloc, 0x100)?,
+        })
+    }
+
+    /// Returns the raw `ChannelRing` structure to pass to firmware.
+    pub(crate) fn to_raw(&self) -> raw::ChannelRing<ChannelState, DeviceControlMsg::ver> {
+        self.ch.ring.to_raw()
+    }
+
+    /// Submits a Device Control command.
+    pub(crate) fn send(&mut self, msg: &DeviceControlMsg::ver) -> u32 {
+        cls_dev_dbg!(DeviceControlCh, self.dev, "DeviceControl: {:?}\n", msg);
+        self.ch.put(msg)
+    }
+
+    /// Waits for a previously submitted Device Control command to complete.
+    pub(crate) fn wait_for(&mut self, wptr: u32) -> Result {
+        self.ch.wait_for(wptr, Self::COMMAND_TIMEOUT_MS)
+    }
+}
+
+/// Pipe channel to submit WorkQueue execution requests.
+#[versions(AGX)]
+pub(crate) struct PipeChannel {
+    dev: AsahiDevice,
+    ch: TxChannel<ChannelState, PipeMsg::ver>,
+}
+
+#[versions(AGX)]
+impl PipeChannel::ver {
+    /// Allocate a new Pipe submission channel.
+    pub(crate) fn new(
+        dev: &AsahiDevice,
+        alloc: &mut gpu::KernelAllocators,
+    ) -> Result<PipeChannel::ver> {
+        Ok(PipeChannel::ver {
+            dev: dev.clone(),
+            ch: TxChannel::<ChannelState, PipeMsg::ver>::new(alloc, 0x100)?,
+        })
+    }
+
+    /// Returns the raw `ChannelRing` structure to pass to firmware.
+    pub(crate) fn to_raw(&self) -> raw::ChannelRing<ChannelState, PipeMsg::ver> {
+        self.ch.ring.to_raw()
+    }
+
+    /// Submits a Pipe kick command to the firmware.
+    pub(crate) fn send(&mut self, msg: &PipeMsg::ver) {
+        cls_dev_dbg!(PipeCh, self.dev, "Pipe: {:?}\n", msg);
+        self.ch.put(msg);
+    }
+}
+
+/// Firmware Control channel, used for secure cache flush requests.
+pub(crate) struct FwCtlChannel {
+    dev: AsahiDevice,
+    ch: TxChannel<FwCtlChannelState, FwCtlMsg>,
+}
+
+impl FwCtlChannel {
+    const COMMAND_TIMEOUT_MS: u64 = 1000;
+
+    /// Allocate a new Firmware Control channel.
+    pub(crate) fn new(
+        dev: &AsahiDevice,
+        alloc: &mut gpu::KernelAllocators,
+    ) -> Result<FwCtlChannel> {
+        Ok(FwCtlChannel {
+            dev: dev.clone(),
+            ch: TxChannel::<FwCtlChannelState, FwCtlMsg>::new_uncached(alloc, 0x100)?,
+        })
+    }
+
+    /// Returns the raw `ChannelRing` structure to pass to firmware.
+    pub(crate) fn to_raw(&self) -> raw::ChannelRing<FwCtlChannelState, FwCtlMsg> {
+        self.ch.ring.to_raw()
+    }
+
+    /// Submits a Firmware Control command to the firmware.
+    pub(crate) fn send(&mut self, msg: &FwCtlMsg) -> u32 {
+        cls_dev_dbg!(FwCtlCh, self.dev, "FwCtl: {:?}\n", msg);
+        self.ch.put(msg)
+    }
+
+    /// Waits for a previously submitted Firmware Control command to complete.
+    pub(crate) fn wait_for(&mut self, wptr: u32) -> Result {
+        self.ch.wait_for(wptr, Self::COMMAND_TIMEOUT_MS)
+    }
+}
+
+/// Event channel, used to notify the driver of command completions, GPU faults and errors, and
+/// other events.
+pub(crate) struct EventChannel {
+    dev: AsahiDevice,
+    ch: RxChannel<ChannelState, RawEventMsg>,
+    mgr: Arc<event::EventManager>,
+    gpu: Option<Arc<dyn gpu::GpuManager>>,
+}
+
+impl EventChannel {
+    /// Allocate a new Event channel.
+    pub(crate) fn new(
+        dev: &AsahiDevice,
+        alloc: &mut gpu::KernelAllocators,
+        mgr: Arc<event::EventManager>,
+    ) -> Result<EventChannel> {
+        Ok(EventChannel {
+            dev: dev.clone(),
+            ch: RxChannel::<ChannelState, RawEventMsg>::new(alloc, 0x100)?,
+            mgr,
+            gpu: None,
+        })
+    }
+
+    /// Registers the managing `Gpu` instance that will handle events on this channel.
+    pub(crate) fn set_manager(&mut self, gpu: Arc<dyn gpu::GpuManager>) {
+        self.gpu = Some(gpu);
+    }
+
+    /// Returns the raw `ChannelRing` structure to pass to firmware.
+    pub(crate) fn to_raw(&self) -> raw::ChannelRing<ChannelState, RawEventMsg> {
+        self.ch.ring.to_raw()
+    }
+
+    /// Polls for new Event messages on this ring.
+    pub(crate) fn poll(&mut self) {
+        while let Some(msg) = self.ch.get(0) {
+            let tag = unsafe { msg.raw.0 };
+            match tag {
+                0..=EVENT_MAX => {
+                    let msg = unsafe { msg.msg };
+
+                    cls_dev_dbg!(EventCh, self.dev, "Event: {:?}\n", msg);
+                    match msg {
+                        EventMsg::Fault => match self.gpu.as_ref() {
+                            Some(gpu) => gpu.handle_fault(),
+                            None => {
+                                dev_crit!(self.dev, "EventChannel: No GPU manager available!\n")
+                            }
+                        },
+                        EventMsg::Timeout {
+                            counter,
+                            event_slot,
+                            ..
+                        } => match self.gpu.as_ref() {
+                            Some(gpu) => gpu.handle_timeout(counter, event_slot),
+                            None => {
+                                dev_crit!(self.dev, "EventChannel: No GPU manager available!\n")
+                            }
+                        },
+                        EventMsg::Flag { firing, .. } => {
+                            for (i, flags) in firing.iter().enumerate() {
+                                for j in 0..32 {
+                                    if flags & (1u32 << j) != 0 {
+                                        self.mgr.signal((i * 32 + j) as u32);
+                                    }
+                                }
+                            }
+                        }
+                        msg => {
+                            dev_crit!(self.dev, "Unknown event message: {:?}\n", msg);
+                        }
+                    }
+                }
+                _ => {
+                    dev_warn!(self.dev, "Unknown event message: {:?}\n", unsafe {
+                        msg.raw
+                    });
+                }
+            }
+        }
+    }
+}
+
+/// Firmware Log channel. This one is pretty special, since it has 6 sub-channels (for different log
+/// levels), and it also uses a side buffer to actually hold the log messages, only passing around
+/// pointers in the main buffer.
+pub(crate) struct FwLogChannel {
+    dev: AsahiDevice,
+    ch: RxChannel<FwLogChannelState, RawFwLogMsg>,
+    payload_buf: GpuArray<RawFwLogPayloadMsg>,
+}
+
+impl FwLogChannel {
+    const RING_SIZE: usize = 0x100;
+    const BUF_SIZE: usize = 0x100;
+
+    /// Allocate a new Firmware Log channel.
+    pub(crate) fn new(
+        dev: &AsahiDevice,
+        alloc: &mut gpu::KernelAllocators,
+    ) -> Result<FwLogChannel> {
+        Ok(FwLogChannel {
+            dev: dev.clone(),
+            ch: RxChannel::<FwLogChannelState, RawFwLogMsg>::new(alloc, Self::RING_SIZE)?,
+            payload_buf: alloc
+                .shared
+                .array_empty(Self::BUF_SIZE * FwLogChannelState::SUB_CHANNELS)?,
+        })
+    }
+
+    /// Returns the raw `ChannelRing` structure to pass to firmware.
+    pub(crate) fn to_raw(&self) -> raw::ChannelRing<FwLogChannelState, RawFwLogMsg> {
+        self.ch.ring.to_raw()
+    }
+
+    /// Returns the GPU pointers to the firmware log payload buffer.
+    pub(crate) fn get_buf(&self) -> GpuWeakPointer<[RawFwLogPayloadMsg]> {
+        self.payload_buf.weak_pointer()
+    }
+
+    /// Polls for new log messages on all sub-rings.
+    pub(crate) fn poll(&mut self) {
+        for i in 0..=FwLogChannelState::SUB_CHANNELS - 1 {
+            while let Some(msg) = self.ch.peek(i) {
+                cls_dev_dbg!(FwLogCh, self.dev, "FwLog{}: {:?}\n", i, msg);
+                if msg.msg_type != 2 {
+                    dev_warn!(self.dev, "Unknown FWLog{} message: {:?}\n", i, msg);
+                    self.ch.get(i);
+                    continue;
+                }
+                if msg.msg_index.0 as usize >= Self::BUF_SIZE {
+                    dev_warn!(
+                        self.dev,
+                        "FWLog{} message index out of bounds: {:?}\n",
+                        i,
+                        msg
+                    );
+                    self.ch.get(i);
+                    continue;
+                }
+                let index = Self::BUF_SIZE * i + msg.msg_index.0 as usize;
+                let payload = &self.payload_buf.as_slice()[index];
+                if payload.msg_type != 3 {
+                    dev_warn!(self.dev, "Unknown FWLog{} payload: {:?}\n", i, payload);
+                    self.ch.get(i);
+                    continue;
+                }
+                let msg = if let Some(end) = payload.msg.iter().position(|&r| r == 0) {
+                    CStr::from_bytes_with_nul(&(*payload.msg)[..end + 1])
+                        .unwrap_or(c_str!("cstr_err"))
+                } else {
+                    dev_warn!(
+                        self.dev,
+                        "FWLog{} payload not NUL-terminated: {:?}\n",
+                        i,
+                        payload
+                    );
+                    self.ch.get(i);
+                    continue;
+                };
+                match i {
+                    0 => dev_dbg!(self.dev, "FWLog: {}\n", msg),
+                    1 => dev_info!(self.dev, "FWLog: {}\n", msg),
+                    2 => dev_notice!(self.dev, "FWLog: {}\n", msg),
+                    3 => dev_warn!(self.dev, "FWLog: {}\n", msg),
+                    4 => dev_err!(self.dev, "FWLog: {}\n", msg),
+                    5 => dev_crit!(self.dev, "FWLog: {}\n", msg),
+                    _ => (),
+                };
+                self.ch.get(i);
+            }
+        }
+    }
+}
+
+pub(crate) struct KTraceChannel {
+    dev: AsahiDevice,
+    ch: RxChannel<ChannelState, RawKTraceMsg>,
+}
+
+/// KTrace channel, used to receive detailed execution trace markers from the firmware.
+/// We currently disable this in initdata, so no messages are expected here at this time.
+impl KTraceChannel {
+    /// Allocate a new KTrace channel.
+    pub(crate) fn new(
+        dev: &AsahiDevice,
+        alloc: &mut gpu::KernelAllocators,
+    ) -> Result<KTraceChannel> {
+        Ok(KTraceChannel {
+            dev: dev.clone(),
+            ch: RxChannel::<ChannelState, RawKTraceMsg>::new(alloc, 0x200)?,
+        })
+    }
+
+    /// Returns the raw `ChannelRing` structure to pass to firmware.
+    pub(crate) fn to_raw(&self) -> raw::ChannelRing<ChannelState, RawKTraceMsg> {
+        self.ch.ring.to_raw()
+    }
+
+    /// Polls for new KTrace messages on this ring.
+    pub(crate) fn poll(&mut self) {
+        while let Some(msg) = self.ch.get(0) {
+            cls_dev_dbg!(KTraceCh, self.dev, "KTrace: {:?}\n", msg);
+        }
+    }
+}
+
+/// Statistics channel, reporting power-related statistics to the driver.
+/// Not really implemented other than debug logs yet...
+#[versions(AGX)]
+pub(crate) struct StatsChannel {
+    dev: AsahiDevice,
+    ch: RxChannel<ChannelState, RawStatsMsg::ver>,
+}
+
+#[versions(AGX)]
+impl StatsChannel::ver {
+    /// Allocate a new Statistics channel.
+    pub(crate) fn new(
+        dev: &AsahiDevice,
+        alloc: &mut gpu::KernelAllocators,
+    ) -> Result<StatsChannel::ver> {
+        Ok(StatsChannel::ver {
+            dev: dev.clone(),
+            ch: RxChannel::<ChannelState, RawStatsMsg::ver>::new(alloc, 0x100)?,
+        })
+    }
+
+    /// Returns the raw `ChannelRing` structure to pass to firmware.
+    pub(crate) fn to_raw(&self) -> raw::ChannelRing<ChannelState, RawStatsMsg::ver> {
+        self.ch.ring.to_raw()
+    }
+
+    /// Polls for new statistics messages on this ring.
+    pub(crate) fn poll(&mut self) {
+        while let Some(msg) = self.ch.get(0) {
+            let tag = unsafe { msg.raw.0 };
+            match tag {
+                0..=STATS_MAX::ver => {
+                    let msg = unsafe { msg.msg };
+                    cls_dev_dbg!(StatsCh, self.dev, "Stats: {:?}\n", msg);
+                }
+                _ => {
+                    pr_warn!("Unknown stats message: {:?}\n", unsafe { msg.raw });
+                }
+            }
+        }
+    }
+}
diff --git a/drivers/gpu/drm/asahi/debug.rs b/drivers/gpu/drm/asahi/debug.rs
new file mode 100644
index 000000000000..2f3a70e04cfd
--- /dev/null
+++ b/drivers/gpu/drm/asahi/debug.rs
@@ -0,0 +1,129 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+#![allow(dead_code)]
+
+//! Debug enable/disable flags and convenience macros
+
+#[allow(unused_imports)]
+pub(crate) use super::{cls_dev_dbg, cls_pr_debug, debug, mod_dev_dbg, mod_pr_debug};
+use core::sync::atomic::{AtomicU64, Ordering};
+
+static DEBUG_FLAGS: AtomicU64 = AtomicU64::new(0);
+
+/// Debug flag bit indices
+pub(crate) enum DebugFlags {
+    // 0-3: Memory-related debug
+    Mmu = 0,
+    Alloc = 1,
+    Gem = 2,
+    Object = 3,
+
+    // 4-7: Firmware objects and resources
+    Event = 4,
+    Buffer = 5,
+    WorkQueue = 6,
+
+    // 8-13: DRM interface, rendering, compute, GPU globals
+    Gpu = 8,
+    File = 9,
+    Queue = 10,
+    Render = 11,
+    Compute = 12,
+
+    // 14-15: Misc stats
+    MemStats = 14,
+    TVBStats = 15,
+
+    // 16-22: Channels
+    FwLogCh = 16,
+    KTraceCh = 17,
+    StatsCh = 18,
+    EventCh = 19,
+    PipeCh = 20,
+    DeviceControlCh = 21,
+    FwCtlCh = 22,
+
+    // 32-35: Allocator debugging
+    FillAllocations = 32,
+    DebugAllocations = 33,
+    DetectOverflows = 34,
+    ForceCPUMaps = 35,
+
+    // 36-: Behavior flags
+    ConservativeTlbi = 36,
+    KeepGpuPowered = 37,
+    WaitForPowerOff = 38,
+    NoGpuRecovery = 39,
+    DisableClustering = 40,
+
+    // 48-: Misc
+    Debug0 = 48,
+    Debug1 = 49,
+    Debug2 = 50,
+    Debug3 = 51,
+    Debug4 = 52,
+    Debug5 = 53,
+    Debug6 = 54,
+    Debug7 = 55,
+}
+
+/// Update the cached global debug flags from the module parameter
+pub(crate) fn update_debug_flags() {
+    let flags = {
+        let lock = crate::THIS_MODULE.kernel_param_lock();
+        *crate::debug_flags.read(&lock)
+    };
+
+    DEBUG_FLAGS.store(flags, Ordering::Relaxed);
+}
+
+/// Check whether debug is enabled for a given flag
+#[inline(always)]
+pub(crate) fn debug_enabled(flag: DebugFlags) -> bool {
+    DEBUG_FLAGS.load(Ordering::Relaxed) & 1 << (flag as usize) != 0
+}
+
+/// Run some code only if debug is enabled for the calling module
+#[macro_export]
+macro_rules! debug {
+    ($($arg:tt)*) => {
+        if $crate::debug::debug_enabled(DEBUG_CLASS) {
+            $($arg)*
+        }
+    };
+}
+
+/// pr_info!() if debug is enabled for the calling module
+#[macro_export]
+macro_rules! mod_pr_debug (
+    ($($arg:tt)*) => (
+        $crate::debug! { ::kernel::pr_info! ( $($arg)* ); }
+    )
+);
+
+/// dev_info!() if debug is enabled for the calling module
+#[macro_export]
+macro_rules! mod_dev_dbg (
+    ($($arg:tt)*) => (
+        $crate::debug! { ::kernel::dev_info! ( $($arg)* ); }
+    )
+);
+
+/// pr_info!() if debug is enabled for a specific module
+#[macro_export]
+macro_rules! cls_pr_debug (
+    ($cls:ident, $($arg:tt)*) => (
+        if $crate::debug::debug_enabled($crate::debug::DebugFlags::$cls) {
+            ::kernel::pr_info! ( $($arg)* );
+        }
+    )
+);
+
+/// dev_info!() if debug is enabled for a specific module
+#[macro_export]
+macro_rules! cls_dev_dbg (
+    ($cls:ident, $($arg:tt)*) => (
+        if $crate::debug::debug_enabled($crate::debug::DebugFlags::$cls) {
+            ::kernel::dev_info! ( $($arg)* );
+        }
+    )
+);
diff --git a/drivers/gpu/drm/asahi/driver.rs b/drivers/gpu/drm/asahi/driver.rs
new file mode 100644
index 000000000000..d49d8b1934a4
--- /dev/null
+++ b/drivers/gpu/drm/asahi/driver.rs
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+
+//! Top-level GPU driver implementation.
+
+use kernel::{
+    c_str, device, drm, drm::drv, drm::ioctl, error::Result, of, platform, prelude::*, sync::Arc,
+};
+
+use crate::{debug, file, gem, gpu, hw, regs};
+
+use kernel::device::RawDevice;
+use kernel::macros::vtable;
+
+/// Driver metadata
+const INFO: drv::DriverInfo = drv::DriverInfo {
+    major: 0,
+    minor: 0,
+    patchlevel: 0,
+    name: c_str!("asahi"),
+    desc: c_str!("Apple AGX Graphics"),
+    date: c_str!("20220831"),
+};
+
+/// Device data for the driver registration.
+///
+/// Holds a reference to the top-level `GpuManager` object.
+pub(crate) struct AsahiData {
+    pub(crate) dev: device::Device,
+    pub(crate) gpu: Arc<dyn gpu::GpuManager>,
+}
+
+/// Convenience type alias for the `device::Data` type for this driver.
+type DeviceData = device::Data<drv::Registration<AsahiDriver>, regs::Resources, AsahiData>;
+
+/// Empty struct representing this driver.
+pub(crate) struct AsahiDriver;
+
+/// Convenience type alias for the DRM device type for this driver.
+pub(crate) type AsahiDevice = kernel::drm::device::Device<AsahiDriver>;
+
+/// DRM Driver implementation for `AsahiDriver`.
+#[vtable]
+impl drv::Driver for AsahiDriver {
+    /// Our `DeviceData` type, reference-counted
+    type Data = Arc<DeviceData>;
+    /// Our `File` type.
+    type File = file::File;
+    /// Our `Object` type.
+    type Object = gem::Object;
+
+    const INFO: drv::DriverInfo = INFO;
+    const FEATURES: u32 =
+        drv::FEAT_GEM | drv::FEAT_RENDER | drv::FEAT_SYNCOBJ | drv::FEAT_SYNCOBJ_TIMELINE;
+
+    kernel::declare_drm_ioctls! {
+        (ASAHI_GET_PARAMS,      drm_asahi_get_params,
+                          ioctl::RENDER_ALLOW, file::File::get_params),
+        (ASAHI_VM_CREATE,       drm_asahi_vm_create,
+            ioctl::AUTH | ioctl::RENDER_ALLOW, file::File::vm_create),
+        (ASAHI_VM_DESTROY,      drm_asahi_vm_destroy,
+            ioctl::AUTH | ioctl::RENDER_ALLOW, file::File::vm_destroy),
+        (ASAHI_GEM_CREATE,      drm_asahi_gem_create,
+            ioctl::AUTH | ioctl::RENDER_ALLOW, file::File::gem_create),
+        (ASAHI_GEM_MMAP_OFFSET, drm_asahi_gem_mmap_offset,
+            ioctl::AUTH | ioctl::RENDER_ALLOW, file::File::gem_mmap_offset),
+        (ASAHI_GEM_BIND,        drm_asahi_gem_bind,
+            ioctl::AUTH | ioctl::RENDER_ALLOW, file::File::gem_bind),
+        (ASAHI_QUEUE_CREATE,    drm_asahi_queue_create,
+            ioctl::AUTH | ioctl::RENDER_ALLOW, file::File::queue_create),
+        (ASAHI_QUEUE_DESTROY,   drm_asahi_queue_destroy,
+            ioctl::AUTH | ioctl::RENDER_ALLOW, file::File::queue_destroy),
+        (ASAHI_SUBMIT,          drm_asahi_submit,
+            ioctl::AUTH | ioctl::RENDER_ALLOW, file::File::submit),
+    }
+}
+
+// OF Device ID table.
+kernel::define_of_id_table! {ASAHI_ID_TABLE, &'static hw::HwConfig, [
+    (of::DeviceId::Compatible(b"apple,agx-t8103"), Some(&hw::t8103::HWCONFIG)),
+    (of::DeviceId::Compatible(b"apple,agx-t8112"), Some(&hw::t8112::HWCONFIG)),
+    (of::DeviceId::Compatible(b"apple,agx-t6000"), Some(&hw::t600x::HWCONFIG_T6000)),
+    (of::DeviceId::Compatible(b"apple,agx-t6001"), Some(&hw::t600x::HWCONFIG_T6001)),
+    (of::DeviceId::Compatible(b"apple,agx-t6002"), Some(&hw::t600x::HWCONFIG_T6002)),
+]}
+
+/// Platform Driver implementation for `AsahiDriver`.
+impl platform::Driver for AsahiDriver {
+    /// Our `DeviceData` type, reference-counted
+    type Data = Arc<DeviceData>;
+    /// Data associated with each hardware ID.
+    type IdInfo = &'static hw::HwConfig;
+
+    // Assign the above OF ID table to this driver.
+    kernel::driver_of_id_table!(ASAHI_ID_TABLE);
+
+    /// Device probe function.
+    fn probe(
+        pdev: &mut platform::Device,
+        id_info: Option<&Self::IdInfo>,
+    ) -> Result<Arc<DeviceData>> {
+        debug::update_debug_flags();
+
+        let dev = device::Device::from_dev(pdev);
+
+        dev_info!(dev, "Probing...\n");
+
+        let cfg = id_info.ok_or(ENODEV)?;
+
+        pdev.set_dma_masks((1 << cfg.uat_oas) - 1)?;
+
+        let res = regs::Resources::new(pdev)?;
+
+        // Initialize misc MMIO
+        res.init_mmio()?;
+
+        // Start the coprocessor CPU, so UAT can initialize the handoff
+        res.start_cpu()?;
+
+        let node = dev.of_node().ok_or(EIO)?;
+        let compat: Vec<u32> = node.get_property(c_str!("apple,firmware-compat"))?;
+
+        let reg = drm::drv::Registration::<AsahiDriver>::new(&dev)?;
+        let gpu = match (cfg.gpu_gen, compat.as_slice()) {
+            (hw::GpuGen::G13, &[12, 3, 0]) => {
+                gpu::GpuManagerG13V12_3::new(reg.device(), &res, cfg)? as Arc<dyn gpu::GpuManager>
+            }
+            (hw::GpuGen::G13, &[13, 2, 0]) => {
+                gpu::GpuManagerG13V13_2::new(reg.device(), &res, cfg)? as Arc<dyn gpu::GpuManager>
+            }
+            (hw::GpuGen::G14, &[12, 4, 0]) => {
+                gpu::GpuManagerG14V12_4::new(reg.device(), &res, cfg)? as Arc<dyn gpu::GpuManager>
+            }
+            (hw::GpuGen::G14, &[13, 2, 0]) => {
+                gpu::GpuManagerG14V13_2::new(reg.device(), &res, cfg)? as Arc<dyn gpu::GpuManager>
+            }
+            _ => {
+                dev_info!(
+                    dev,
+                    "Unsupported GPU/firmware combination ({:?}, {:?})\n",
+                    cfg.gpu_gen,
+                    compat
+                );
+                return Err(ENODEV);
+            }
+        };
+
+        let data =
+            kernel::new_device_data!(reg, res, AsahiData { dev, gpu }, "Asahi::Registrations")?;
+
+        let data = Arc::<DeviceData>::from(data);
+
+        data.gpu.init()?;
+
+        kernel::drm_device_register!(
+            data.registrations().ok_or(ENXIO)?.as_pinned_mut(),
+            data.clone(),
+            0
+        )?;
+
+        dev_info!(data.dev, "Probed!\n");
+        Ok(data)
+    }
+}
+
+// Export the OF ID table as a module ID table, to make modpost/autoloading work.
+kernel::module_of_id_table!(MOD_TABLE, ASAHI_ID_TABLE);
diff --git a/drivers/gpu/drm/asahi/event.rs b/drivers/gpu/drm/asahi/event.rs
new file mode 100644
index 000000000000..ccf00e4104be
--- /dev/null
+++ b/drivers/gpu/drm/asahi/event.rs
@@ -0,0 +1,229 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+
+//! GPU event manager
+//!
+//! The GPU firmware manages work completion by using event objects (Apple calls them "stamps"),
+//! which are monotonically incrementing counters. There are a fixed number of objects, and
+//! they are managed with a `SlotAllocator`.
+//!
+//! This module manages the set of available events and lets users compute expected values.
+//! It also manages signaling owners when the GPU firmware reports that an event fired.
+
+use crate::debug::*;
+use crate::fw::types::*;
+use crate::{gpu, slotalloc, workqueue};
+use core::cmp;
+use core::sync::atomic::Ordering;
+use kernel::prelude::*;
+use kernel::sync::Arc;
+
+const DEBUG_CLASS: DebugFlags = DebugFlags::Event;
+
+/// Number of events managed by the firmware.
+const NUM_EVENTS: u32 = 128;
+
+/// Inner data associated with a given event slot.
+pub(crate) struct EventInner {
+    /// CPU pointer to the driver notification event stamp
+    stamp: *const AtomicU32,
+    /// GPU pointer to the driver notification event stamp
+    gpu_stamp: GpuWeakPointer<Stamp>,
+    /// GPU pointer to the firmware-internal event stamp
+    gpu_fw_stamp: GpuWeakPointer<FwStamp>,
+}
+
+/// SAFETY: The event slots are safe to send across threads.
+unsafe impl Send for EventInner {}
+
+/// Alias for an event token, which allows requesting the same event.
+pub(crate) type Token = slotalloc::SlotToken;
+/// Alias for an allocated `Event` that has a slot.
+pub(crate) type Event = slotalloc::Guard<EventInner>;
+
+/// Represents a given stamp value for an event.
+#[derive(Eq, PartialEq, Copy, Clone, Debug)]
+#[repr(transparent)]
+pub(crate) struct EventValue(u32);
+
+impl EventValue {
+    /// Returns the `EventValue` that succeeds this one.
+    pub(crate) fn next(&self) -> EventValue {
+        EventValue(self.0.wrapping_add(0x100))
+    }
+
+    /// Increments this `EventValue` in place.
+    pub(crate) fn increment(&mut self) {
+        self.0 = self.0.wrapping_add(0x100);
+    }
+
+    /* Not used
+    /// Increments this `EventValue` in place by a certain count.
+    pub(crate) fn add(&mut self, val: u32) {
+        self.0 = self
+            .0
+            .wrapping_add(val.checked_mul(0x100).expect("Adding too many events"));
+    }
+    */
+
+    /// Increments this `EventValue` in place by a certain count.
+    pub(crate) fn sub(&mut self, val: u32) {
+        self.0 = self
+            .0
+            .wrapping_sub(val.checked_mul(0x100).expect("Subtracting too many events"));
+    }
+
+    /// Computes the delta between this event and another event.
+    pub(crate) fn delta(&self, other: &EventValue) -> i32 {
+        (self.0.wrapping_sub(other.0) as i32) >> 8
+    }
+}
+
+impl PartialOrd for EventValue {
+    fn partial_cmp(&self, other: &Self) -> Option<cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Ord for EventValue {
+    fn cmp(&self, other: &Self) -> cmp::Ordering {
+        self.delta(other).cmp(&0)
+    }
+}
+
+impl EventInner {
+    /// Returns the GPU pointer to the driver notification stamp
+    pub(crate) fn stamp_pointer(&self) -> GpuWeakPointer<Stamp> {
+        self.gpu_stamp
+    }
+
+    /// Returns the GPU pointer to the firmware internal stamp
+    pub(crate) fn fw_stamp_pointer(&self) -> GpuWeakPointer<FwStamp> {
+        self.gpu_fw_stamp
+    }
+
+    /// Fetches the current event value from shared memory
+    pub(crate) fn current(&self) -> EventValue {
+        // SAFETY: The pointer is always valid as constructed in
+        // EventManager below, and outside users cannot construct
+        // new EventInners, nor move or copy them, and Guards as
+        // returned by the SlotAllocator hold a reference to the
+        // SlotAllocator containing the EventManagerInner, which
+        // keeps the GpuObject the stamp is contained within alive.
+        EventValue(unsafe { &*self.stamp }.load(Ordering::Acquire))
+    }
+}
+
+impl slotalloc::SlotItem for EventInner {
+    type Data = EventManagerInner;
+
+    fn release(&mut self, data: &mut Self::Data, slot: u32) {
+        mod_pr_debug!("EventManager: Released slot {}\n", slot);
+        data.owners[slot as usize] = None;
+    }
+}
+
+/// Inner data for the event manager, to be protected by the SlotAllocator lock.
+pub(crate) struct EventManagerInner {
+    stamps: GpuArray<Stamp>,
+    fw_stamps: GpuArray<FwStamp>,
+    // Note: Use dyn to avoid having to version this entire module.
+    owners: Vec<Option<Arc<dyn workqueue::WorkQueue + Send + Sync>>>,
+}
+
+/// Top-level EventManager object.
+pub(crate) struct EventManager {
+    alloc: slotalloc::SlotAllocator<EventInner>,
+}
+
+impl EventManager {
+    /// Create a new EventManager.
+    #[inline(never)]
+    pub(crate) fn new(alloc: &mut gpu::KernelAllocators) -> Result<EventManager> {
+        let mut owners = Vec::new();
+        for _i in 0..(NUM_EVENTS as usize) {
+            owners.try_push(None)?;
+        }
+        let inner = EventManagerInner {
+            stamps: alloc.shared.array_empty(NUM_EVENTS as usize)?,
+            fw_stamps: alloc.private.array_empty(NUM_EVENTS as usize)?,
+            owners,
+        };
+
+        Ok(EventManager {
+            alloc: slotalloc::SlotAllocator::new(
+                NUM_EVENTS,
+                inner,
+                |inner: &mut EventManagerInner, slot| EventInner {
+                    stamp: &inner.stamps[slot as usize].0,
+                    gpu_stamp: inner.stamps.weak_item_pointer(slot as usize),
+                    gpu_fw_stamp: inner.fw_stamps.weak_item_pointer(slot as usize),
+                },
+            )?,
+        })
+    }
+
+    /// Gets a free `Event`, optionally trying to reuse the last one allocated by this caller.
+    pub(crate) fn get(
+        &self,
+        token: Option<Token>,
+        owner: Arc<dyn workqueue::WorkQueue + Send + Sync>,
+    ) -> Result<Event> {
+        let ev = self.alloc.get_inner(token, |inner, ev| {
+            mod_pr_debug!(
+                "EventManager: Registered owner {:p} on slot {}\n",
+                &*owner,
+                ev.slot()
+            );
+            inner.owners[ev.slot() as usize] = Some(owner);
+            Ok(())
+        })?;
+        Ok(ev)
+    }
+
+    /// Signals an event by slot, indicating completion (of one or more commands).
+    pub(crate) fn signal(&self, slot: u32) {
+        match self
+            .alloc
+            .with_inner(|inner| inner.owners[slot as usize].as_ref().cloned())
+        {
+            Some(owner) => {
+                owner.signal();
+            }
+            None => {
+                mod_pr_debug!("EventManager: Received event for empty slot {}\n", slot);
+            }
+        }
+    }
+
+    /// Marks the owner of an event as having lost its work due to a GPU error.
+    pub(crate) fn mark_error(&self, slot: u32, wait_value: u32, error: workqueue::WorkError) {
+        match self
+            .alloc
+            .with_inner(|inner| inner.owners[slot as usize].as_ref().cloned())
+        {
+            Some(owner) => {
+                owner.mark_error(EventValue(wait_value), error);
+            }
+            None => {
+                pr_err!("Received error for empty slot {}\n", slot);
+            }
+        }
+    }
+
+    /// Fail all commands, used when the GPU crashes.
+    pub(crate) fn fail_all(&self, error: workqueue::WorkError) {
+        let mut owners: Vec<Arc<dyn workqueue::WorkQueue + Send + Sync>> = Vec::new();
+
+        self.alloc.with_inner(|inner| {
+            for wq in inner.owners.iter().filter_map(|o| o.as_ref()).cloned() {
+                if owners.try_push(wq).is_err() {
+                    pr_err!("Failed to signal failure to WorkQueue\n");
+                }
+            }
+        });
+
+        for wq in owners {
+            wq.fail_all(error);
+        }
+    }
+}
diff --git a/drivers/gpu/drm/asahi/file.rs b/drivers/gpu/drm/asahi/file.rs
new file mode 100644
index 000000000000..5d47feb30134
--- /dev/null
+++ b/drivers/gpu/drm/asahi/file.rs
@@ -0,0 +1,718 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+#![allow(clippy::unusual_byte_groupings)]
+
+//! File implementation, which represents a single DRM client.
+//!
+//! This is in charge of managing the resources associated with one GPU client, including an
+//! arbitrary number of submission queues and Vm objects, and reporting hardware/driver
+//! information to userspace and accepting submissions.
+
+use crate::debug::*;
+use crate::driver::AsahiDevice;
+use crate::{alloc, buffer, driver, gem, mmu, queue};
+use core::mem::MaybeUninit;
+use kernel::dma_fence::RawDmaFence;
+use kernel::drm::gem::BaseObject;
+use kernel::io_buffer::{IoBufferReader, IoBufferWriter};
+use kernel::prelude::*;
+use kernel::sync::{smutex::Mutex, Arc};
+use kernel::user_ptr::UserSlicePtr;
+use kernel::{bindings, dma_fence, drm, xarray};
+
+const DEBUG_CLASS: DebugFlags = DebugFlags::File;
+
+const MAX_SYNCS_PER_SUBMISSION: u32 = 64;
+const MAX_COMMANDS_PER_SUBMISSION: u32 = 64;
+pub(crate) const MAX_COMMANDS_IN_FLIGHT: u32 = 1024;
+
+/// A client instance of an `mmu::Vm` address space.
+struct Vm {
+    ualloc: Arc<Mutex<alloc::DefaultAllocator>>,
+    ualloc_priv: Arc<Mutex<alloc::DefaultAllocator>>,
+    vm: mmu::Vm,
+    dummy_obj: gem::ObjectRef,
+}
+
+impl Drop for Vm {
+    fn drop(&mut self) {
+        // Mappings create a reference loop, make sure to break it.
+        self.dummy_obj.drop_vm_mappings(self.vm.id());
+    }
+}
+
+/// Sync object from userspace.
+pub(crate) struct SyncItem {
+    pub(crate) syncobj: drm::syncobj::SyncObj,
+    pub(crate) fence: Option<dma_fence::Fence>,
+    pub(crate) chain_fence: Option<dma_fence::FenceChain>,
+    pub(crate) timeline_value: u64,
+}
+
+impl SyncItem {
+    fn parse_one(file: &DrmFile, data: bindings::drm_asahi_sync, out: bool) -> Result<SyncItem> {
+        if data.extensions != 0 {
+            return Err(EINVAL);
+        }
+
+        match data.sync_type {
+            bindings::drm_asahi_sync_type_DRM_ASAHI_SYNC_SYNCOBJ => {
+                if data.timeline_value != 0 {
+                    return Err(EINVAL);
+                }
+                let syncobj = drm::syncobj::SyncObj::lookup_handle(file, data.handle)?;
+
+                Ok(SyncItem {
+                    fence: if out {
+                        None
+                    } else {
+                        Some(syncobj.fence_get().ok_or(EINVAL)?)
+                    },
+                    syncobj,
+                    chain_fence: None,
+                    timeline_value: data.timeline_value,
+                })
+            }
+            bindings::drm_asahi_sync_type_DRM_ASAHI_SYNC_TIMELINE_SYNCOBJ => {
+                let syncobj = drm::syncobj::SyncObj::lookup_handle(file, data.handle)?;
+                let fence = if out {
+                    None
+                } else {
+                    Some(
+                        syncobj
+                            .fence_get()
+                            .ok_or(EINVAL)?
+                            .chain_find_seqno(data.timeline_value)?,
+                    )
+                };
+
+                Ok(SyncItem {
+                    fence,
+                    syncobj,
+                    chain_fence: if out {
+                        Some(dma_fence::FenceChain::new()?)
+                    } else {
+                        None
+                    },
+                    timeline_value: data.timeline_value,
+                })
+            }
+            _ => Err(EINVAL),
+        }
+    }
+
+    fn parse_array(file: &DrmFile, ptr: u64, count: u32, out: bool) -> Result<Vec<SyncItem>> {
+        let mut vec = Vec::try_with_capacity(count as usize)?;
+
+        const STRIDE: usize = core::mem::size_of::<bindings::drm_asahi_sync>();
+        let size = STRIDE * count as usize;
+
+        // SAFETY: We only read this once, so there are no TOCTOU issues.
+        let mut reader = unsafe { UserSlicePtr::new(ptr as usize as *mut _, size).reader() };
+
+        for _i in 0..count {
+            let mut sync: MaybeUninit<bindings::drm_asahi_sync> = MaybeUninit::uninit();
+
+            // SAFETY: The size of `sync` is STRIDE
+            unsafe { reader.read_raw(sync.as_mut_ptr() as *mut u8, STRIDE)? };
+
+            // SAFETY: All bit patterns in the struct are valid
+            let sync = unsafe { sync.assume_init() };
+
+            vec.try_push(SyncItem::parse_one(file, sync, out)?)?;
+        }
+
+        Ok(vec)
+    }
+}
+
+/// State associated with a client.
+pub(crate) struct File {
+    id: u64,
+    vms: xarray::XArray<Box<Vm>>,
+    queues: xarray::XArray<Arc<Mutex<Box<dyn queue::Queue>>>>,
+}
+
+/// Convenience type alias for our DRM `File` type.
+pub(crate) type DrmFile = drm::file::File<File>;
+
+/// Start address of the 32-bit USC address space.
+const VM_SHADER_START: u64 = 0x11_00000000;
+/// End address of the 32-bit USC address space.
+const VM_SHADER_END: u64 = 0x11_ffffffff;
+/// Start address of the general user mapping region.
+const VM_USER_START: u64 = 0x20_00000000;
+/// End address of the general user mapping region.
+const VM_USER_END: u64 = 0x5f_ffffffff;
+
+/// Start address of the kernel-managed GPU-only mapping region.
+const VM_DRV_GPU_START: u64 = 0x60_00000000;
+/// End address of the kernel-managed GPU-only mapping region.
+const VM_DRV_GPU_END: u64 = 0x60_ffffffff;
+/// Start address of the kernel-managed GPU/FW shared mapping region.
+const VM_DRV_GPUFW_START: u64 = 0x61_00000000;
+/// End address of the kernel-managed GPU/FW shared mapping region.
+const VM_DRV_GPUFW_END: u64 = 0x61_ffffffff;
+/// Address of a special dummy page?
+const VM_UNK_PAGE: u64 = 0x6f_ffff8000;
+
+impl drm::file::DriverFile for File {
+    type Driver = driver::AsahiDriver;
+
+    /// Create a new `File` instance for a fresh client.
+    fn open(device: &AsahiDevice) -> Result<Box<Self>> {
+        debug::update_debug_flags();
+
+        let gpu = &device.data().gpu;
+        let id = gpu.ids().file.next();
+
+        mod_dev_dbg!(device, "[File {}]: DRM device opened\n", id);
+        Ok(Box::try_new(Self {
+            id,
+            vms: xarray::XArray::new(xarray::flags::ALLOC1)?,
+            queues: xarray::XArray::new(xarray::flags::ALLOC1)?,
+        })?)
+    }
+}
+
+impl File {
+    /// IOCTL: get_param: Get a driver parameter value.
+    pub(crate) fn get_params(
+        device: &AsahiDevice,
+        data: &mut bindings::drm_asahi_get_params,
+        file: &DrmFile,
+    ) -> Result<u32> {
+        mod_dev_dbg!(device, "[File {}]: IOCTL: get_params\n", file.id);
+
+        let gpu = &device.data().gpu;
+
+        if data.extensions != 0 || data.param_group != 0 || data.pad != 0 {
+            return Err(EINVAL);
+        }
+
+        let mut params = bindings::drm_asahi_params_global {
+            unstable_uabi_version: bindings::DRM_ASAHI_UNSTABLE_UABI_VERSION,
+            pad0: 0,
+
+            feat_compat: gpu.get_cfg().gpu_feat_compat,
+            feat_incompat: gpu.get_cfg().gpu_feat_incompat,
+
+            gpu_generation: gpu.get_dyncfg().id.gpu_gen as u32,
+            gpu_variant: gpu.get_dyncfg().id.gpu_variant as u32,
+            gpu_revision: gpu.get_dyncfg().id.gpu_rev as u32,
+            chip_id: gpu.get_cfg().chip_id,
+
+            num_dies: gpu.get_dyncfg().id.max_dies,
+            num_clusters_total: gpu.get_dyncfg().id.num_clusters,
+            num_cores_per_cluster: gpu.get_dyncfg().id.num_cores,
+            num_frags_per_cluster: gpu.get_dyncfg().id.num_frags,
+            num_gps_per_cluster: gpu.get_dyncfg().id.num_gps,
+            num_cores_total_active: gpu.get_dyncfg().id.total_active_cores,
+            core_masks: [0; bindings::DRM_ASAHI_MAX_CLUSTERS as usize],
+
+            vm_page_size: mmu::UAT_PGSZ as u32,
+            pad1: 0,
+            vm_user_start: VM_USER_START,
+            vm_user_end: VM_USER_END,
+            vm_shader_start: VM_SHADER_START,
+            vm_shader_end: VM_SHADER_END,
+
+            max_syncs_per_submission: MAX_SYNCS_PER_SUBMISSION,
+            max_commands_per_submission: MAX_COMMANDS_PER_SUBMISSION,
+            max_commands_in_flight: MAX_COMMANDS_IN_FLIGHT,
+            max_attachments: crate::microseq::MAX_ATTACHMENTS as u32,
+
+            timer_frequency_hz: gpu.get_cfg().base_clock_hz,
+            min_frequency_khz: gpu.get_dyncfg().pwr.min_frequency_khz(),
+            max_frequency_khz: gpu.get_dyncfg().pwr.max_frequency_khz(),
+            max_power_mw: gpu.get_dyncfg().pwr.max_power_mw,
+
+            result_render_size: core::mem::size_of::<bindings::drm_asahi_result_render>() as u32,
+            result_compute_size: core::mem::size_of::<bindings::drm_asahi_result_compute>() as u32,
+        };
+
+        for (i, mask) in gpu.get_dyncfg().id.core_masks.iter().enumerate() {
+            *(params.core_masks.get_mut(i).ok_or(EIO)?) = (*mask).try_into()?;
+        }
+
+        let size =
+            core::mem::size_of::<bindings::drm_asahi_params_global>().min(data.size.try_into()?);
+
+        // SAFETY: We only write to this userptr once, so there are no TOCTOU issues.
+        let mut params_writer =
+            unsafe { UserSlicePtr::new(data.pointer as usize as *mut _, size).writer() };
+
+        // SAFETY: `size` is at most the sizeof of `params`
+        unsafe { params_writer.write_raw(&params as *const _ as *const u8, size)? };
+
+        Ok(0)
+    }
+
+    /// IOCTL: vm_create: Create a new `Vm`.
+    pub(crate) fn vm_create(
+        device: &AsahiDevice,
+        data: &mut bindings::drm_asahi_vm_create,
+        file: &DrmFile,
+    ) -> Result<u32> {
+        if data.extensions != 0 {
+            return Err(EINVAL);
+        }
+
+        let gpu = &device.data().gpu;
+        let file_id = file.id;
+        let vm = gpu.new_vm(file_id)?;
+
+        let resv = file.vms.reserve()?;
+        let id: u32 = resv.index().try_into()?;
+
+        mod_dev_dbg!(device, "[File {} VM {}]: VM Create\n", file_id, id);
+        mod_dev_dbg!(
+            device,
+            "[File {} VM {}]: Creating allocators\n",
+            file_id,
+            id
+        );
+        let ualloc = Arc::try_new(Mutex::new(alloc::DefaultAllocator::new(
+            device,
+            &vm,
+            VM_DRV_GPU_START,
+            VM_DRV_GPU_END,
+            buffer::PAGE_SIZE,
+            mmu::PROT_GPU_SHARED_RW,
+            512 * 1024,
+            true,
+            fmt!("File {} VM {} GPU Shared", file_id, id),
+            false,
+        )?))?;
+        let ualloc_priv = Arc::try_new(Mutex::new(alloc::DefaultAllocator::new(
+            device,
+            &vm,
+            VM_DRV_GPUFW_START,
+            VM_DRV_GPUFW_END,
+            buffer::PAGE_SIZE,
+            mmu::PROT_GPU_FW_PRIV_RW,
+            64 * 1024,
+            true,
+            fmt!("File {} VM {} GPU FW Private", file_id, id),
+            false,
+        )?))?;
+
+        mod_dev_dbg!(
+            device,
+            "[File {} VM {}]: Creating dummy object\n",
+            file_id,
+            id
+        );
+        let mut dummy_obj = gem::new_kernel_object(device, 0x4000)?;
+        dummy_obj.vmap()?.as_mut_slice().fill(0);
+        dummy_obj.map_at(&vm, VM_UNK_PAGE, mmu::PROT_GPU_SHARED_RW, true)?;
+
+        mod_dev_dbg!(device, "[File {} VM {}]: VM created\n", file_id, id);
+        resv.store(Box::try_new(Vm {
+            ualloc,
+            ualloc_priv,
+            vm,
+            dummy_obj,
+        })?)?;
+
+        data.vm_id = id;
+
+        Ok(0)
+    }
+
+    /// IOCTL: vm_destroy: Destroy a `Vm`.
+    pub(crate) fn vm_destroy(
+        _device: &AsahiDevice,
+        data: &mut bindings::drm_asahi_vm_destroy,
+        file: &DrmFile,
+    ) -> Result<u32> {
+        if data.extensions != 0 {
+            return Err(EINVAL);
+        }
+
+        if file.vms.remove(data.vm_id as usize).is_none() {
+            Err(ENOENT)
+        } else {
+            Ok(0)
+        }
+    }
+
+    /// IOCTL: gem_create: Create a new GEM object.
+    pub(crate) fn gem_create(
+        device: &AsahiDevice,
+        data: &mut bindings::drm_asahi_gem_create,
+        file: &DrmFile,
+    ) -> Result<u32> {
+        mod_dev_dbg!(
+            device,
+            "[File {}]: IOCTL: gem_create size={:#x?}\n",
+            file.id,
+            data.size
+        );
+
+        if data.extensions != 0
+            || (data.flags & !(bindings::ASAHI_GEM_WRITEBACK | bindings::ASAHI_GEM_VM_PRIVATE)) != 0
+            || (data.flags & bindings::ASAHI_GEM_VM_PRIVATE == 0 && data.vm_id != 0)
+        {
+            return Err(EINVAL);
+        }
+
+        let vm_id = if data.flags & bindings::ASAHI_GEM_VM_PRIVATE != 0 {
+            Some(file.vms.get(data.vm_id.try_into()?).ok_or(ENOENT)?.vm.id())
+        } else {
+            None
+        };
+
+        let bo = gem::new_object(device, data.size.try_into()?, data.flags, vm_id)?;
+
+        let handle = bo.gem.create_handle(file)?;
+        data.handle = handle;
+
+        mod_dev_dbg!(
+            device,
+            "[File {}]: IOCTL: gem_create size={:#x} handle={:#x?}\n",
+            file.id,
+            data.size,
+            data.handle
+        );
+
+        Ok(0)
+    }
+
+    /// IOCTL: gem_mmap_offset: Assign an mmap offset to a GEM object.
+    pub(crate) fn gem_mmap_offset(
+        device: &AsahiDevice,
+        data: &mut bindings::drm_asahi_gem_mmap_offset,
+        file: &DrmFile,
+    ) -> Result<u32> {
+        mod_dev_dbg!(
+            device,
+            "[File {}]: IOCTL: gem_mmap_offset handle={:#x?}\n",
+            file.id,
+            data.handle
+        );
+
+        if data.extensions != 0 || data.flags != 0 {
+            return Err(EINVAL);
+        }
+
+        let bo = gem::lookup_handle(file, data.handle)?;
+        data.offset = bo.gem.create_mmap_offset()?;
+        Ok(0)
+    }
+
+    /// IOCTL: gem_bind: Map or unmap a GEM object into a Vm.
+    pub(crate) fn gem_bind(
+        device: &AsahiDevice,
+        data: &mut bindings::drm_asahi_gem_bind,
+        file: &DrmFile,
+    ) -> Result<u32> {
+        mod_dev_dbg!(
+            device,
+            "[File {} VM {}]: IOCTL: gem_bind op={:?} handle={:#x?} flags={:#x?} {:#x?}:{:#x?} -> {:#x?}\n",
+            file.id,
+            data.op,
+            data.vm_id,
+            data.handle,
+            data.flags,
+            data.offset,
+            data.range,
+            data.addr
+        );
+
+        if data.extensions != 0 {
+            return Err(EINVAL);
+        }
+
+        match data.op {
+            bindings::drm_asahi_bind_op_ASAHI_BIND_OP_BIND => Self::do_gem_bind(device, data, file),
+            bindings::drm_asahi_bind_op_ASAHI_BIND_OP_UNBIND => Err(ENOTSUPP),
+            bindings::drm_asahi_bind_op_ASAHI_BIND_OP_UNBIND_ALL => {
+                Self::do_gem_unbind_all(device, data, file)
+            }
+            _ => Err(EINVAL),
+        }
+    }
+
+    pub(crate) fn do_gem_bind(
+        _device: &AsahiDevice,
+        data: &mut bindings::drm_asahi_gem_bind,
+        file: &DrmFile,
+    ) -> Result<u32> {
+        if data.offset != 0 {
+            return Err(EINVAL); // Not supported yet
+        }
+
+        if (data.addr | data.range) as usize & mmu::UAT_PGMSK != 0 {
+            return Err(EINVAL); // Must be page aligned
+        }
+
+        if (data.flags & !(bindings::ASAHI_BIND_READ | bindings::ASAHI_BIND_WRITE)) != 0 {
+            return Err(EINVAL);
+        }
+
+        let mut bo = gem::lookup_handle(file, data.handle)?;
+
+        if data.range != bo.size().try_into()? {
+            return Err(EINVAL); // Not supported yet
+        }
+
+        let start = data.addr;
+        let end = data.addr + data.range - 1;
+
+        if (VM_SHADER_START..=VM_SHADER_END).contains(&start) {
+            if !(VM_SHADER_START..=VM_SHADER_END).contains(&end) {
+                return Err(EINVAL); // Invalid map range
+            }
+        } else if (VM_USER_START..=VM_USER_END).contains(&start) {
+            if !(VM_USER_START..=VM_USER_END).contains(&end) {
+                return Err(EINVAL); // Invalid map range
+            }
+        } else {
+            return Err(EINVAL); // Invalid map range
+        }
+
+        // Just in case
+        if end >= VM_DRV_GPU_START {
+            return Err(EINVAL);
+        }
+
+        let prot = if data.flags & bindings::ASAHI_BIND_READ != 0 {
+            if data.flags & bindings::ASAHI_BIND_WRITE != 0 {
+                mmu::PROT_GPU_SHARED_RW
+            } else {
+                mmu::PROT_GPU_SHARED_RO
+            }
+        } else if data.flags & bindings::ASAHI_BIND_WRITE != 0 {
+            mmu::PROT_GPU_SHARED_WO
+        } else {
+            return Err(EINVAL); // Must specify one of ASAHI_BIND_{READ,WRITE}
+        };
+
+        // Clone it immediately so we aren't holding the XArray lock
+        let vm = file
+            .vms
+            .get(data.vm_id.try_into()?)
+            .ok_or(ENOENT)?
+            .vm
+            .clone();
+
+        bo.map_at(&vm, start, prot, true)?;
+
+        Ok(0)
+    }
+
+    pub(crate) fn do_gem_unbind_all(
+        _device: &AsahiDevice,
+        data: &mut bindings::drm_asahi_gem_bind,
+        file: &DrmFile,
+    ) -> Result<u32> {
+        if data.flags != 0 || data.offset != 0 || data.range != 0 || data.addr != 0 {
+            return Err(EINVAL);
+        }
+
+        let mut bo = gem::lookup_handle(file, data.handle)?;
+
+        if data.vm_id == 0 {
+            bo.drop_file_mappings(file.id);
+        } else {
+            let vm_id = file.vms.get(data.vm_id.try_into()?).ok_or(ENOENT)?.vm.id();
+            bo.drop_vm_mappings(vm_id);
+        }
+
+        Ok(0)
+    }
+
+    /// IOCTL: queue_create: Create a new command submission queue of a given type.
+    pub(crate) fn queue_create(
+        device: &AsahiDevice,
+        data: &mut bindings::drm_asahi_queue_create,
+        file: &DrmFile,
+    ) -> Result<u32> {
+        let file_id = file.id;
+
+        mod_dev_dbg!(
+            device,
+            "[File {} VM {}]: Creating queue caps={:?} prio={:?} flags={:#x?}\n",
+            file_id,
+            data.vm_id,
+            data.queue_caps,
+            data.priority,
+            data.flags,
+        );
+
+        if data.extensions != 0
+            || data.flags != 0
+            || data.priority > 3
+            || data.queue_caps == 0
+            || (data.queue_caps
+                & !(bindings::drm_asahi_queue_cap_DRM_ASAHI_QUEUE_CAP_RENDER
+                    | bindings::drm_asahi_queue_cap_DRM_ASAHI_QUEUE_CAP_BLIT
+                    | bindings::drm_asahi_queue_cap_DRM_ASAHI_QUEUE_CAP_COMPUTE))
+                != 0
+        {
+            return Err(EINVAL);
+        }
+
+        let resv = file.queues.reserve()?;
+        let file_vm = file.vms.get(data.vm_id.try_into()?).ok_or(ENOENT)?;
+        let vm = file_vm.vm.clone();
+        let ualloc = file_vm.ualloc.clone();
+        let ualloc_priv = file_vm.ualloc_priv.clone();
+        // Drop the vms lock eagerly
+        core::mem::drop(file_vm);
+
+        let queue =
+            device
+                .data()
+                .gpu
+                .new_queue(vm, ualloc, ualloc_priv, data.priority, data.queue_caps)?;
+
+        data.queue_id = resv.index().try_into()?;
+        resv.store(Arc::try_new(Mutex::new(queue))?)?;
+
+        Ok(0)
+    }
+
+    /// IOCTL: queue_destroy: Destroy a command submission queue.
+    pub(crate) fn queue_destroy(
+        _device: &AsahiDevice,
+        data: &mut bindings::drm_asahi_queue_destroy,
+        file: &DrmFile,
+    ) -> Result<u32> {
+        if data.extensions != 0 {
+            return Err(EINVAL);
+        }
+
+        if file.queues.remove(data.queue_id as usize).is_none() {
+            Err(ENOENT)
+        } else {
+            Ok(0)
+        }
+    }
+
+    /// IOCTL: submit: Submit GPU work to a command submission queue.
+    pub(crate) fn submit(
+        device: &AsahiDevice,
+        data: &mut bindings::drm_asahi_submit,
+        file: &DrmFile,
+    ) -> Result<u32> {
+        if data.extensions != 0
+            || data.flags != 0
+            || data.in_sync_count > MAX_SYNCS_PER_SUBMISSION
+            || data.out_sync_count > MAX_SYNCS_PER_SUBMISSION
+            || data.command_count > MAX_COMMANDS_PER_SUBMISSION
+        {
+            return Err(EINVAL);
+        }
+
+        debug::update_debug_flags();
+
+        let gpu = &device.data().gpu;
+        gpu.update_globals();
+
+        // Upgrade to Arc<T> to drop the XArray lock early
+        let queue: Arc<Mutex<Box<dyn queue::Queue>>> = file
+            .queues
+            .get(data.queue_id.try_into()?)
+            .ok_or(ENOENT)?
+            .borrow()
+            .into();
+
+        let id = gpu.ids().submission.next();
+        mod_dev_dbg!(
+            device,
+            "[File {} Queue {}]: IOCTL: submit (submission ID: {})\n",
+            file.id,
+            data.queue_id,
+            id
+        );
+
+        mod_dev_dbg!(
+            device,
+            "[File {} Queue {}]: IOCTL: submit({}): Parsing in_syncs\n",
+            file.id,
+            data.queue_id,
+            id
+        );
+        let in_syncs = SyncItem::parse_array(file, data.in_syncs, data.in_sync_count, false)?;
+        mod_dev_dbg!(
+            device,
+            "[File {} Queue {}]: IOCTL: submit({}): Parsing out_syncs\n",
+            file.id,
+            data.queue_id,
+            id
+        );
+        let out_syncs = SyncItem::parse_array(file, data.out_syncs, data.out_sync_count, true)?;
+
+        let result_buf = if data.result_handle != 0 {
+            mod_dev_dbg!(
+                device,
+                "[File {} Queue {}]: IOCTL: submit({}): Looking up result_handle {}\n",
+                file.id,
+                data.queue_id,
+                id,
+                data.result_handle
+            );
+            Some(gem::lookup_handle(file, data.result_handle)?)
+        } else {
+            None
+        };
+
+        mod_dev_dbg!(
+            device,
+            "[File {} Queue {}]: IOCTL: submit({}): Parsing commands\n",
+            file.id,
+            data.queue_id,
+            id
+        );
+        let mut commands = Vec::try_with_capacity(data.command_count as usize)?;
+
+        const STRIDE: usize = core::mem::size_of::<bindings::drm_asahi_command>();
+        let size = STRIDE * data.command_count as usize;
+
+        // SAFETY: We only read this once, so there are no TOCTOU issues.
+        let mut reader =
+            unsafe { UserSlicePtr::new(data.commands as usize as *mut _, size).reader() };
+
+        for _i in 0..data.command_count {
+            let mut cmd: MaybeUninit<bindings::drm_asahi_command> = MaybeUninit::uninit();
+
+            // SAFETY: The size of `sync` is STRIDE
+            unsafe { reader.read_raw(cmd.as_mut_ptr() as *mut u8, STRIDE)? };
+
+            // SAFETY: All bit patterns in the struct are valid
+            commands.try_push(unsafe { cmd.assume_init() })?;
+        }
+
+        let ret = queue
+            .lock()
+            .submit(id, in_syncs, out_syncs, result_buf, commands);
+
+        match ret {
+            Err(ERESTARTSYS) => Err(ERESTARTSYS),
+            Err(e) => {
+                dev_info!(
+                    device,
+                    "[File {} Queue {}]: IOCTL: submit failed! (submission ID: {} err: {:?})\n",
+                    file.id,
+                    data.queue_id,
+                    id,
+                    e
+                );
+                Err(e)
+            }
+            Ok(_) => Ok(0),
+        }
+    }
+
+    /// Returns the unique file ID for this `File`.
+    pub(crate) fn file_id(&self) -> u64 {
+        self.id
+    }
+}
+
+impl Drop for File {
+    fn drop(&mut self) {
+        mod_pr_debug!("[File {}]: Closing...\n", self.id);
+    }
+}
diff --git a/drivers/gpu/drm/asahi/float.rs b/drivers/gpu/drm/asahi/float.rs
new file mode 100644
index 000000000000..e73b4b628cf9
--- /dev/null
+++ b/drivers/gpu/drm/asahi/float.rs
@@ -0,0 +1,381 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+
+//! Basic soft floating-point support
+//!
+//! The GPU firmware requires a large number of power-related configuration values, many of which
+//! are IEEE 754 32-bit floating point values. These values change not only between GPU/SoC
+//! variants, but also between specific hardware platforms using these SoCs, so they must be
+//! derived from device tree properties. There are many redundant values computed from the same
+//! inputs with simple add/sub/mul/div calculations, plus a few values that are actually specific
+//! to each individual device depending on its binning and fused voltage configuration, so it
+//! doesn't make sense to store the final values to be passed to the firmware in the device tree.
+//!
+//! Therefore, we need a way to perform floating-point calculations in the kernel.
+//!
+//! Using the actual FPU from kernel mode is asking for trouble, since there is no way to bound
+//! the execution of FPU instructions to a controlled section of code without outright putting it
+//! in its own compilation unit, which is quite painful for Rust. Since these calculations only
+//! have to happen at initialization time and there is no need for performance, let's use a simple
+//! software float implementation instead.
+//!
+//! This implementation makes no attempt to be fully IEEE754 compliant, but it's good enough and
+//! gives bit-identical results to macOS in the vast majority of cases, with one or two exceptions
+//! related to slightly non-compliant rounding.
+
+use core::ops;
+use kernel::{of, prelude::*};
+
+/// An IEEE754-compatible floating point number implemented in software.
+#[derive(Default, Debug, Copy, Clone)]
+pub(crate) struct F32(u32);
+
+#[derive(Default, Debug, Copy, Clone)]
+struct F32U {
+    sign: bool,
+    exp: i32,
+    frac: i64,
+}
+
+impl F32 {
+    /// Convert a raw 32-bit representation into an F32
+    pub(crate) const fn from_bits(u: u32) -> F32 {
+        F32(u)
+    }
+
+    // Convert a `f32` value into an F32
+    //
+    // This must ONLY be used in const context. Use the `f32!{}` macro to do it safely.
+    #[doc(hidden)]
+    pub(crate) const fn from_f32(v: f32) -> F32 {
+        F32(unsafe { core::mem::transmute(v) })
+    }
+
+    // Convert an F32 into a `f32` value
+    //
+    // For testing only.
+    #[doc(hidden)]
+    #[cfg(test)]
+    pub(crate) fn to_f32(self) -> f32 {
+        f32::from_bits(self.0)
+    }
+
+    const fn unpack(&self) -> F32U {
+        F32U {
+            sign: self.0 & (1 << 31) != 0,
+            exp: ((self.0 >> 23) & 0xff) as i32 - 127,
+            frac: (((self.0 & 0x7fffff) | 0x800000) as i64) << 9,
+        }
+        .norm()
+    }
+}
+
+/// Safely construct an `F32` out of a constant floating-point value.
+///
+/// This ensures that the conversion happens in const context, so no floating point operations are
+/// emitted.
+#[macro_export]
+macro_rules! f32 {
+    ([$($val:expr),*]) => {{
+        [$(f32!($val)),*]
+    }};
+    ($val:expr) => {{
+        const _K: $crate::float::F32 = $crate::float::F32::from_f32($val);
+        _K
+    }};
+}
+
+impl ops::Neg for F32 {
+    type Output = F32;
+
+    fn neg(self) -> F32 {
+        F32(self.0 ^ (1 << 31))
+    }
+}
+
+impl ops::Add<F32> for F32 {
+    type Output = F32;
+
+    fn add(self, rhs: F32) -> F32 {
+        self.unpack().add(rhs.unpack()).pack()
+    }
+}
+
+impl ops::Sub<F32> for F32 {
+    type Output = F32;
+
+    fn sub(self, rhs: F32) -> F32 {
+        self.unpack().add((-rhs).unpack()).pack()
+    }
+}
+
+impl ops::Mul<F32> for F32 {
+    type Output = F32;
+
+    fn mul(self, rhs: F32) -> F32 {
+        self.unpack().mul(rhs.unpack()).pack()
+    }
+}
+
+impl ops::Div<F32> for F32 {
+    type Output = F32;
+
+    fn div(self, rhs: F32) -> F32 {
+        self.unpack().div(rhs.unpack()).pack()
+    }
+}
+
+macro_rules! from_ints {
+    ($u:ty, $i:ty) => {
+        impl From<$i> for F32 {
+            fn from(v: $i) -> F32 {
+                F32U::from_i64(v as i64).pack()
+            }
+        }
+        impl From<$u> for F32 {
+            fn from(v: $u) -> F32 {
+                F32U::from_u64(v as u64).pack()
+            }
+        }
+    };
+}
+
+from_ints!(u8, i8);
+from_ints!(u16, i16);
+from_ints!(u32, i32);
+from_ints!(u64, i64);
+
+impl F32U {
+    const INFINITY: F32U = f32!(f32::INFINITY).unpack();
+    const NEG_INFINITY: F32U = f32!(f32::NEG_INFINITY).unpack();
+
+    fn from_i64(v: i64) -> F32U {
+        F32U {
+            sign: v < 0,
+            exp: 32,
+            frac: v.abs(),
+        }
+        .norm()
+    }
+
+    fn from_u64(mut v: u64) -> F32U {
+        let mut exp = 32;
+        if v >= (1 << 63) {
+            exp = 31;
+            v >>= 1;
+        }
+        F32U {
+            sign: false,
+            exp,
+            frac: v as i64,
+        }
+        .norm()
+    }
+
+    fn shr(&mut self, shift: i32) {
+        if shift > 63 {
+            self.exp = 0;
+            self.frac = 0;
+        } else {
+            self.frac >>= shift;
+        }
+    }
+
+    fn align(a: &mut F32U, b: &mut F32U) {
+        if a.exp > b.exp {
+            b.shr(a.exp - b.exp);
+            b.exp = a.exp;
+        } else {
+            a.shr(b.exp - a.exp);
+            a.exp = b.exp;
+        }
+    }
+
+    fn mul(self, other: F32U) -> F32U {
+        F32U {
+            sign: self.sign != other.sign,
+            exp: self.exp + other.exp,
+            frac: ((self.frac >> 8) * (other.frac >> 8)) >> 16,
+        }
+    }
+
+    fn div(self, other: F32U) -> F32U {
+        if other.frac == 0 || self.is_inf() {
+            if self.sign {
+                F32U::NEG_INFINITY
+            } else {
+                F32U::INFINITY
+            }
+        } else {
+            F32U {
+                sign: self.sign != other.sign,
+                exp: self.exp - other.exp,
+                frac: ((self.frac << 24) / (other.frac >> 8)),
+            }
+        }
+    }
+
+    fn add(mut self, mut other: F32U) -> F32U {
+        F32U::align(&mut self, &mut other);
+        if self.sign == other.sign {
+            self.frac += other.frac;
+        } else {
+            self.frac -= other.frac;
+        }
+        if self.frac < 0 {
+            self.sign = !self.sign;
+            self.frac = -self.frac;
+        }
+        self
+    }
+
+    const fn norm(mut self) -> F32U {
+        let lz = self.frac.leading_zeros() as i32;
+        if lz > 31 {
+            self.frac <<= lz - 31;
+            self.exp -= lz - 31;
+        } else if lz < 31 {
+            self.frac >>= 31 - lz;
+            self.exp += 31 - lz;
+        }
+
+        if self.is_zero() {
+            return F32U {
+                sign: self.sign,
+                frac: 0,
+                exp: 0,
+            };
+        }
+        self
+    }
+
+    const fn is_zero(&self) -> bool {
+        self.frac == 0 || self.exp < -126
+    }
+
+    const fn is_inf(&self) -> bool {
+        self.exp > 127
+    }
+
+    const fn pack(mut self) -> F32 {
+        self = self.norm();
+        if !self.is_zero() {
+            self.frac += 0x100;
+            self = self.norm();
+        }
+
+        if self.is_inf() {
+            if self.sign {
+                return f32!(f32::NEG_INFINITY);
+            } else {
+                return f32!(f32::INFINITY);
+            }
+        } else if self.is_zero() {
+            if self.sign {
+                return f32!(-0.0);
+            } else {
+                return f32!(0.0);
+            }
+        }
+
+        F32(if self.sign { 1u32 << 31 } else { 0u32 }
+            | ((self.exp + 127) as u32) << 23
+            | ((self.frac >> 9) & 0x7fffff) as u32)
+    }
+}
+
+impl<'a> TryFrom<of::Property<'a>> for F32 {
+    type Error = Error;
+
+    fn try_from(p: of::Property<'_>) -> core::result::Result<F32, Self::Error> {
+        let bits: u32 = p.try_into()?;
+        Ok(F32::from_bits(bits))
+    }
+}
+
+impl of::PropertyUnit for F32 {
+    const UNIT_SIZE: usize = 4;
+
+    fn from_bytes(data: &[u8]) -> Result<Self> {
+        Ok(F32::from_bits(<u32 as of::PropertyUnit>::from_bytes(data)?))
+    }
+}
+
+// TODO: Make this an actual test and figure out how to make it run.
+#[cfg(test)]
+mod tests {
+    #[test]
+    fn test_all() {
+        fn add(a: f32, b: f32) {
+            println!(
+                "{} + {} = {} {}",
+                a,
+                b,
+                (F32::from_f32(a) + F32::from_f32(b)).to_f32(),
+                a + b
+            );
+        }
+        fn sub(a: f32, b: f32) {
+            println!(
+                "{} - {} = {} {}",
+                a,
+                b,
+                (F32::from_f32(a) - F32::from_f32(b)).to_f32(),
+                a - b
+            );
+        }
+        fn mul(a: f32, b: f32) {
+            println!(
+                "{} * {} = {} {}",
+                a,
+                b,
+                (F32::from_f32(a) * F32::from_f32(b)).to_f32(),
+                a * b
+            );
+        }
+        fn div(a: f32, b: f32) {
+            println!(
+                "{} / {} = {} {}",
+                a,
+                b,
+                (F32::from_f32(a) / F32::from_f32(b)).to_f32(),
+                a / b
+            );
+        }
+
+        fn test(a: f32, b: f32) {
+            add(a, b);
+            sub(a, b);
+            mul(a, b);
+            div(a, b);
+        }
+
+        test(1.123, 7.567);
+        test(1.123, 1.456);
+        test(7.567, 1.123);
+        test(1.123, -7.567);
+        test(1.123, -1.456);
+        test(7.567, -1.123);
+        test(-1.123, -7.567);
+        test(-1.123, -1.456);
+        test(-7.567, -1.123);
+        test(1000.123, 0.001);
+        test(1000.123, 0.0000001);
+        test(0.0012, 1000.123);
+        test(0.0000001, 1000.123);
+        test(0., 0.);
+        test(0., 1.);
+        test(1., 0.);
+        test(1., 1.);
+        test(2., f32::INFINITY);
+        test(2., f32::NEG_INFINITY);
+        test(f32::INFINITY, 2.);
+        test(f32::NEG_INFINITY, 2.);
+        test(f32::NEG_INFINITY, 2.);
+        test(f32::MAX, 2.);
+        test(f32::MIN, 2.);
+        test(f32::MIN_POSITIVE, 2.);
+        test(2., f32::MAX);
+        test(2., f32::MIN);
+        test(2., f32::MIN_POSITIVE);
+    }
+}
diff --git a/drivers/gpu/drm/asahi/fw/buffer.rs b/drivers/gpu/drm/asahi/fw/buffer.rs
new file mode 100644
index 000000000000..a8a467879518
--- /dev/null
+++ b/drivers/gpu/drm/asahi/fw/buffer.rs
@@ -0,0 +1,170 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+
+//! GPU tiled vertex buffer control firmware structures
+
+use super::types::*;
+use super::workqueue;
+use crate::{default_zeroed, no_debug, trivial_gpustruct};
+use kernel::sync::Arc;
+
+pub(crate) mod raw {
+    use super::*;
+
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct BlockControl {
+        pub(crate) total: AtomicU32,
+        pub(crate) wptr: AtomicU32,
+        pub(crate) unk: AtomicU32,
+        pub(crate) pad: Pad<0x34>,
+    }
+    default_zeroed!(BlockControl);
+
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct Counter {
+        pub(crate) count: AtomicU32,
+        __pad: Pad<0x3c>,
+    }
+    default_zeroed!(Counter);
+
+    #[derive(Debug, Default)]
+    #[repr(C)]
+    pub(crate) struct Stats {
+        pub(crate) max_pages: AtomicU32,
+        pub(crate) max_b: AtomicU32,
+        pub(crate) overflow_count: AtomicU32,
+        pub(crate) gpu_c: AtomicU32,
+        pub(crate) __pad0: Pad<0x10>,
+        pub(crate) reset: AtomicU32,
+        pub(crate) __pad1: Pad<0x1c>,
+    }
+
+    #[versions(AGX)]
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct Info<'a> {
+        pub(crate) gpu_counter: u32,
+        pub(crate) unk_4: u32,
+        pub(crate) last_id: i32,
+        pub(crate) cur_id: i32,
+        pub(crate) unk_10: u32,
+        pub(crate) gpu_counter2: u32,
+        pub(crate) unk_18: u32,
+
+        #[ver(V < V13_0B4)]
+        pub(crate) unk_1c: u32,
+
+        pub(crate) page_list: GpuPointer<'a, &'a [u32]>,
+        pub(crate) page_list_size: u32,
+        pub(crate) page_count: AtomicU32,
+        pub(crate) max_blocks: u32,
+        pub(crate) block_count: AtomicU32,
+        pub(crate) unk_38: u32,
+        pub(crate) block_list: GpuPointer<'a, &'a [u32]>,
+        pub(crate) block_ctl: GpuPointer<'a, super::BlockControl>,
+        pub(crate) last_page: AtomicU32,
+        pub(crate) gpu_page_ptr1: u32,
+        pub(crate) gpu_page_ptr2: u32,
+        pub(crate) unk_58: u32,
+        pub(crate) block_size: u32,
+        pub(crate) unk_60: U64,
+        pub(crate) counter: GpuPointer<'a, super::Counter>,
+        pub(crate) unk_70: u32,
+        pub(crate) unk_74: u32,
+        pub(crate) unk_78: u32,
+        pub(crate) unk_7c: u32,
+        pub(crate) unk_80: u32,
+        pub(crate) max_pages: u32,
+        pub(crate) max_pages_nomemless: u32,
+        pub(crate) unk_8c: u32,
+        pub(crate) unk_90: Array<0x30, u8>,
+    }
+
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct Scene<'a> {
+        pub(crate) pass_page_count: AtomicU32,
+        pub(crate) unk_4: u32,
+        pub(crate) unk_8: U64,
+        pub(crate) unk_10: U64,
+        pub(crate) user_buffer: GpuPointer<'a, &'a [u8]>,
+        pub(crate) unk_20: u32,
+        pub(crate) stats: GpuWeakPointer<super::Stats>,
+        pub(crate) total_page_count: AtomicU32,
+        pub(crate) unk_30: U64, // pad
+        pub(crate) unk_38: U64, // pad
+    }
+
+    #[versions(AGX)]
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct InitBuffer<'a> {
+        pub(crate) tag: workqueue::CommandType,
+        pub(crate) vm_slot: u32,
+        pub(crate) buffer_slot: u32,
+        pub(crate) unk_c: u32,
+        pub(crate) block_count: u32,
+        pub(crate) buffer: GpuPointer<'a, super::Info::ver>,
+        pub(crate) stamp_value: EventValue,
+    }
+}
+
+trivial_gpustruct!(BlockControl);
+trivial_gpustruct!(Counter);
+trivial_gpustruct!(Stats);
+
+#[versions(AGX)]
+#[derive(Debug)]
+pub(crate) struct Info {
+    pub(crate) block_ctl: GpuObject<BlockControl>,
+    pub(crate) counter: GpuObject<Counter>,
+    pub(crate) page_list: GpuArray<u32>,
+    pub(crate) block_list: GpuArray<u32>,
+}
+
+#[versions(AGX)]
+impl GpuStruct for Info::ver {
+    type Raw<'a> = raw::Info::ver<'a>;
+}
+
+pub(crate) struct ClusterBuffers {
+    pub(crate) tilemaps: GpuArray<u8>,
+    pub(crate) meta: GpuArray<u8>,
+}
+
+#[versions(AGX)]
+pub(crate) struct Scene {
+    pub(crate) user_buffer: GpuArray<u8>,
+    pub(crate) buffer: crate::buffer::Buffer::ver,
+    pub(crate) tvb_heapmeta: GpuArray<u8>,
+    pub(crate) tvb_tilemap: GpuArray<u8>,
+    pub(crate) tpc: Arc<GpuArray<u8>>,
+    pub(crate) clustering: Option<ClusterBuffers>,
+    pub(crate) preempt_buf: GpuArray<u8>,
+    pub(crate) seq_buf: GpuArray<u64>,
+}
+
+#[versions(AGX)]
+no_debug!(Scene::ver);
+
+#[versions(AGX)]
+impl GpuStruct for Scene::ver {
+    type Raw<'a> = raw::Scene<'a>;
+}
+
+#[versions(AGX)]
+pub(crate) struct InitBuffer {
+    pub(crate) scene: Arc<crate::buffer::Scene::ver>,
+}
+
+#[versions(AGX)]
+no_debug!(InitBuffer::ver);
+
+#[versions(AGX)]
+impl workqueue::Command for InitBuffer::ver {}
+
+#[versions(AGX)]
+impl GpuStruct for InitBuffer::ver {
+    type Raw<'a> = raw::InitBuffer::ver<'a>;
+}
diff --git a/drivers/gpu/drm/asahi/fw/channels.rs b/drivers/gpu/drm/asahi/fw/channels.rs
new file mode 100644
index 000000000000..db5ac9a3ded5
--- /dev/null
+++ b/drivers/gpu/drm/asahi/fw/channels.rs
@@ -0,0 +1,385 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+
+//! GPU communication channel firmware structures (ring buffers)
+
+use super::types::*;
+use crate::default_zeroed;
+use core::sync::atomic::Ordering;
+
+pub(crate) mod raw {
+    use super::*;
+
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct ChannelState<'a> {
+        pub(crate) read_ptr: AtomicU32,
+        __pad0: Pad<0x1c>,
+        pub(crate) write_ptr: AtomicU32,
+        __pad1: Pad<0xc>,
+        _p: PhantomData<&'a ()>,
+    }
+    default_zeroed!(<'a>, ChannelState<'a>);
+
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct FwCtlChannelState<'a> {
+        pub(crate) read_ptr: AtomicU32,
+        __pad0: Pad<0xc>,
+        pub(crate) write_ptr: AtomicU32,
+        __pad1: Pad<0xc>,
+        _p: PhantomData<&'a ()>,
+    }
+    default_zeroed!(<'a>, FwCtlChannelState<'a>);
+}
+
+pub(crate) trait RxChannelState: GpuStruct + Debug + Default
+where
+    for<'a> <Self as GpuStruct>::Raw<'a>: Default + Zeroed,
+{
+    const SUB_CHANNELS: usize;
+
+    fn wptr(raw: &Self::Raw<'_>, index: usize) -> u32;
+    fn set_rptr(raw: &Self::Raw<'_>, index: usize, rptr: u32);
+}
+
+#[derive(Debug, Default)]
+pub(crate) struct ChannelState {}
+
+impl GpuStruct for ChannelState {
+    type Raw<'a> = raw::ChannelState<'a>;
+}
+
+impl RxChannelState for ChannelState {
+    const SUB_CHANNELS: usize = 1;
+
+    fn wptr(raw: &Self::Raw<'_>, _index: usize) -> u32 {
+        raw.write_ptr.load(Ordering::Acquire)
+    }
+
+    fn set_rptr(raw: &Self::Raw<'_>, _index: usize, rptr: u32) {
+        raw.read_ptr.store(rptr, Ordering::Release);
+    }
+}
+
+#[derive(Debug, Default)]
+pub(crate) struct FwLogChannelState {}
+
+impl GpuStruct for FwLogChannelState {
+    type Raw<'a> = Array<6, raw::ChannelState<'a>>;
+}
+
+impl RxChannelState for FwLogChannelState {
+    const SUB_CHANNELS: usize = 6;
+
+    fn wptr(raw: &Self::Raw<'_>, index: usize) -> u32 {
+        raw[index].write_ptr.load(Ordering::Acquire)
+    }
+
+    fn set_rptr(raw: &Self::Raw<'_>, index: usize, rptr: u32) {
+        raw[index].read_ptr.store(rptr, Ordering::Release);
+    }
+}
+
+#[derive(Debug, Default)]
+pub(crate) struct FwCtlChannelState {}
+
+impl GpuStruct for FwCtlChannelState {
+    type Raw<'a> = raw::FwCtlChannelState<'a>;
+}
+
+pub(crate) trait TxChannelState: GpuStruct + Debug + Default {
+    fn rptr(raw: &Self::Raw<'_>) -> u32;
+    fn set_wptr(raw: &Self::Raw<'_>, wptr: u32);
+}
+
+impl TxChannelState for ChannelState {
+    fn rptr(raw: &Self::Raw<'_>) -> u32 {
+        raw.read_ptr.load(Ordering::Acquire)
+    }
+
+    fn set_wptr(raw: &Self::Raw<'_>, wptr: u32) {
+        raw.write_ptr.store(wptr, Ordering::Release);
+    }
+}
+
+impl TxChannelState for FwCtlChannelState {
+    fn rptr(raw: &Self::Raw<'_>) -> u32 {
+        raw.read_ptr.load(Ordering::Acquire)
+    }
+
+    fn set_wptr(raw: &Self::Raw<'_>, wptr: u32) {
+        raw.write_ptr.store(wptr, Ordering::Release);
+    }
+}
+
+#[derive(Debug, Copy, Clone, Default)]
+#[repr(u32)]
+pub(crate) enum PipeType {
+    #[default]
+    Vertex = 0,
+    Fragment = 1,
+    Compute = 2,
+}
+
+#[versions(AGX)]
+#[derive(Debug, Copy, Clone, Default)]
+#[repr(C)]
+pub(crate) struct RunWorkQueueMsg {
+    pub(crate) pipe_type: PipeType,
+    pub(crate) work_queue: Option<GpuWeakPointer<super::workqueue::QueueInfo::ver>>,
+    pub(crate) wptr: u32,
+    pub(crate) event_slot: u32,
+    pub(crate) is_new: bool,
+    #[ver(V >= V13_2 && G >= G14)]
+    pub(crate) __pad: Pad<0x2b>,
+    #[ver(V < V13_2 || G < G14)]
+    pub(crate) __pad: Pad<0x1b>,
+}
+
+#[versions(AGX)]
+pub(crate) type PipeMsg = RunWorkQueueMsg::ver;
+
+#[versions(AGX)]
+pub(crate) const DEVICECONTROL_SZ: usize = {
+    #[ver(V < V13_2 || G < G14)]
+    {
+        0x2c
+    }
+    #[ver(V >= V13_2 && G >= G14)]
+    {
+        0x3c
+    }
+};
+
+// TODO: clean up when arbitrary_enum_discriminant is stable
+// https://github.com/rust-lang/rust/issues/60553
+
+#[versions(AGX)]
+#[derive(Debug, Copy, Clone)]
+#[repr(C, u32)]
+#[allow(dead_code)]
+pub(crate) enum DeviceControlMsg {
+    Unk00(Array<DEVICECONTROL_SZ::ver, u8>),
+    Unk01(Array<DEVICECONTROL_SZ::ver, u8>),
+    Unk02(Array<DEVICECONTROL_SZ::ver, u8>),
+    Unk03(Array<DEVICECONTROL_SZ::ver, u8>),
+    Unk04(Array<DEVICECONTROL_SZ::ver, u8>),
+    Unk05(Array<DEVICECONTROL_SZ::ver, u8>),
+    Unk06(Array<DEVICECONTROL_SZ::ver, u8>),
+    Unk07(Array<DEVICECONTROL_SZ::ver, u8>),
+    Unk08(Array<DEVICECONTROL_SZ::ver, u8>),
+    Unk09(Array<DEVICECONTROL_SZ::ver, u8>),
+    Unk0a(Array<DEVICECONTROL_SZ::ver, u8>),
+    Unk0b(Array<DEVICECONTROL_SZ::ver, u8>),
+    Unk0c(Array<DEVICECONTROL_SZ::ver, u8>),
+    Unk0d(Array<DEVICECONTROL_SZ::ver, u8>),
+    Unk0e(Array<DEVICECONTROL_SZ::ver, u8>),
+    Unk0f(Array<DEVICECONTROL_SZ::ver, u8>),
+    Unk10(Array<DEVICECONTROL_SZ::ver, u8>),
+    Unk11(Array<DEVICECONTROL_SZ::ver, u8>),
+    Unk12(Array<DEVICECONTROL_SZ::ver, u8>),
+    Unk13(Array<DEVICECONTROL_SZ::ver, u8>),
+    Unk14(Array<DEVICECONTROL_SZ::ver, u8>),
+    Unk15(Array<DEVICECONTROL_SZ::ver, u8>),
+    Unk16(Array<DEVICECONTROL_SZ::ver, u8>),
+    DestroyContext {
+        unk_4: u32,
+        ctx_23: u8,
+        __pad0: Pad<3>,
+        unk_c: u32,
+        unk_10: u32,
+        ctx_0: u8,
+        ctx_1: u8,
+        ctx_4: u8,
+        __pad1: Pad<1>,
+        unk_18: u32,
+        gpu_context: Option<GpuWeakPointer<super::workqueue::GpuContextData>>,
+        __pad2: Pad<{ DEVICECONTROL_SZ::ver - 0x20 }>,
+    },
+    Unk18(Array<DEVICECONTROL_SZ::ver, u8>),
+    Initialize(Pad<DEVICECONTROL_SZ::ver>),
+}
+
+#[versions(AGX)]
+default_zeroed!(DeviceControlMsg::ver);
+
+#[derive(Copy, Clone, Default, Debug)]
+#[repr(C)]
+#[allow(dead_code)]
+pub(crate) struct FwCtlMsg {
+    pub(crate) addr: U64,
+    pub(crate) unk_8: u32,
+    pub(crate) slot: u32,
+    pub(crate) page_count: u16,
+    pub(crate) unk_12: u16,
+}
+
+pub(crate) const EVENT_SZ: usize = 0x34;
+
+#[derive(Debug, Copy, Clone)]
+#[repr(C, u32)]
+#[allow(dead_code)]
+pub(crate) enum EventMsg {
+    Fault,
+    Flag {
+        firing: [u32; 4],
+        unk_14: u16,
+    },
+    Unk2(Array<EVENT_SZ, u8>),
+    Unk3(Array<EVENT_SZ, u8>),
+    Timeout {
+        counter: u32,
+        unk_8: u32,
+        event_slot: u32,
+    }, // Max discriminant: 0x4
+}
+
+pub(crate) const EVENT_MAX: u32 = 0x4;
+
+#[derive(Copy, Clone)]
+#[repr(C)]
+pub(crate) union RawEventMsg {
+    pub(crate) raw: (u32, Array<EVENT_SZ, u8>),
+    pub(crate) msg: EventMsg,
+}
+
+default_zeroed!(RawEventMsg);
+
+#[derive(Debug, Copy, Clone, Default)]
+#[repr(C)]
+pub(crate) struct RawFwLogMsg {
+    pub(crate) msg_type: u32,
+    __pad0: u32,
+    pub(crate) msg_index: U64,
+    __pad1: Pad<0x28>,
+}
+
+#[derive(Debug, Copy, Clone, Default)]
+#[repr(C)]
+pub(crate) struct RawFwLogPayloadMsg {
+    pub(crate) msg_type: u32,
+    pub(crate) seq_no: u32,
+    pub(crate) timestamp: U64,
+    pub(crate) msg: Array<0xc8, u8>,
+}
+
+#[derive(Debug, Copy, Clone, Default)]
+#[repr(C)]
+pub(crate) struct RawKTraceMsg {
+    pub(crate) msg_type: u32,
+    pub(crate) timestamp: U64,
+    pub(crate) args: Array<4, U64>,
+    pub(crate) code: u8,
+    pub(crate) channel: u8,
+    __pad: Pad<1>,
+    pub(crate) thread: u8,
+    pub(crate) unk_flag: U64,
+}
+
+#[versions(AGX)]
+pub(crate) const STATS_SZ: usize = {
+    #[ver(V < V13_0B4)]
+    {
+        0x2c
+    }
+    #[ver(V >= V13_0B4)]
+    {
+        0x3c
+    }
+};
+
+#[versions(AGX)]
+#[derive(Debug, Copy, Clone)]
+#[repr(C, u32)]
+#[allow(dead_code)]
+pub(crate) enum StatsMsg {
+    Power {
+        // 0x00
+        __pad: Pad<0x18>,
+        power: U64,
+    },
+    Unk1(Array<{ STATS_SZ::ver }, u8>),
+    PowerOn {
+        // 0x02
+        off_time: U64,
+    },
+    PowerOff {
+        // 0x03
+        on_time: U64,
+    },
+    Utilization {
+        // 0x04
+        timestamp: U64,
+        util1: u32,
+        util2: u32,
+        util3: u32,
+        util4: u32,
+    },
+    Unk5(Array<{ STATS_SZ::ver }, u8>),
+    Unk6(Array<{ STATS_SZ::ver }, u8>),
+    Unk7(Array<{ STATS_SZ::ver }, u8>),
+    Unk8(Array<{ STATS_SZ::ver }, u8>),
+    AvgPower {
+        // 0x09
+        active_cs: U64,
+        unk2: u32,
+        unk3: u32,
+        unk4: u32,
+        avg_power: u32,
+    },
+    Temperature {
+        // 0x0a
+        __pad: Pad<0x8>,
+        raw_value: u32,
+        scale: u32,
+        tmin: u32,
+        tmax: u32,
+    },
+    PowerState {
+        // 0x0b
+        timestamp: U64,
+        last_busy_ts: U64,
+        active: u32,
+        poweroff: u32,
+        unk1: u32,
+        pstate: u32,
+        unk2: u32,
+        unk3: u32,
+    },
+    FwBusy {
+        // 0x0c
+        timestamp: U64,
+        busy: u32,
+    },
+    PState {
+        // 0x0d
+        __pad: Pad<0x8>,
+        ps_min: u32,
+        unk1: u32,
+        ps_max: u32,
+        unk2: u32,
+    },
+    TempSensor {
+        // 0x0e
+        __pad: Pad<0x4>,
+        sensor_id: u32,
+        raw_value: u32,
+        scale: u32,
+        tmin: u32,
+        tmax: u32,
+    }, // Max discriminant: 0xe
+}
+
+#[versions(AGX)]
+pub(crate) const STATS_MAX: u32 = 0xe;
+
+#[versions(AGX)]
+#[derive(Copy, Clone)]
+#[repr(C)]
+pub(crate) union RawStatsMsg {
+    pub(crate) raw: (u32, Array<{ STATS_SZ::ver }, u8>),
+    pub(crate) msg: StatsMsg::ver,
+}
+
+#[versions(AGX)]
+default_zeroed!(RawStatsMsg::ver);
diff --git a/drivers/gpu/drm/asahi/fw/compute.rs b/drivers/gpu/drm/asahi/fw/compute.rs
new file mode 100644
index 000000000000..0dbcd77c5e3e
--- /dev/null
+++ b/drivers/gpu/drm/asahi/fw/compute.rs
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+
+//! GPU compute job firmware structures
+
+use super::types::*;
+use super::{event, job, workqueue};
+use crate::{microseq, mmu};
+use kernel::sync::Arc;
+
+pub(crate) mod raw {
+    use super::*;
+
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct JobParameters1<'a> {
+        pub(crate) preempt_buf1: GpuPointer<'a, &'a [u8]>,
+        pub(crate) encoder: U64,
+        pub(crate) preempt_buf2: GpuPointer<'a, &'a [u8]>,
+        pub(crate) preempt_buf3: GpuPointer<'a, &'a [u8]>,
+        pub(crate) preempt_buf4: GpuPointer<'a, &'a [u8]>,
+        pub(crate) preempt_buf5: GpuPointer<'a, &'a [u8]>,
+        pub(crate) pipeline_base: U64,
+        pub(crate) unk_38: U64,
+        pub(crate) unk_40: u32,
+        pub(crate) unk_44: u32,
+        pub(crate) compute_layout_addr: U64,
+        pub(crate) unk_50: u32,
+        pub(crate) unk_54: u32,
+        pub(crate) unk_58: u32,
+        pub(crate) unk_5c: u32,
+        pub(crate) iogpu_unk_40: u32,
+    }
+
+    #[versions(AGX)]
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct JobParameters2<'a> {
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_0_0: u32,
+        pub(crate) unk_0: Array<0x24, u8>,
+        pub(crate) preempt_buf1: GpuPointer<'a, &'a [u8]>,
+        pub(crate) encoder_end: U64,
+        pub(crate) unk_34: Array<0x28, u8>,
+        #[ver(V < V13_0B4)]
+        pub(crate) unk_5c: u32,
+    }
+
+    #[versions(AGX)]
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct RunCompute<'a> {
+        pub(crate) tag: workqueue::CommandType,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) counter: U64,
+
+        pub(crate) unk_4: u32,
+        pub(crate) vm_slot: u32,
+        pub(crate) notifier: GpuPointer<'a, event::Notifier::ver>,
+        pub(crate) unk_pointee: Array<0x54, u8>,
+        pub(crate) job_params1: JobParameters1<'a>,
+        pub(crate) unk_b8: Array<0x11c, u8>,
+        pub(crate) microsequence: GpuPointer<'a, &'a [u8]>,
+        pub(crate) microsequence_size: u32,
+        pub(crate) job_params2: JobParameters2::ver<'a>,
+        pub(crate) encoder_params: job::raw::EncoderParams<'a>,
+        pub(crate) meta: job::raw::JobMeta,
+        pub(crate) cur_ts: U64,
+        pub(crate) start_ts: Option<GpuPointer<'a, AtomicU64>>,
+        pub(crate) end_ts: Option<GpuPointer<'a, AtomicU64>>,
+        pub(crate) unk_2c0: u32,
+        pub(crate) unk_2c4: u32,
+        pub(crate) unk_2c8: u32,
+        pub(crate) unk_2cc: u32,
+        pub(crate) client_sequence: u8,
+        pub(crate) pad_2d1: Array<3, u8>,
+        pub(crate) unk_2d4: u32,
+        pub(crate) unk_2d8: u8,
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_ts: U64,
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_2e1: Array<0x1c, u8>,
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_flag: U32,
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_pad: Array<0x10, u8>,
+    }
+}
+
+#[versions(AGX)]
+#[derive(Debug)]
+pub(crate) struct RunCompute {
+    pub(crate) notifier: Arc<GpuObject<event::Notifier::ver>>,
+    pub(crate) preempt_buf: GpuArray<u8>,
+    pub(crate) seq_buf: GpuArray<u64>,
+    pub(crate) micro_seq: microseq::MicroSequence,
+    pub(crate) vm_bind: mmu::VmBind,
+    pub(crate) timestamps: Arc<GpuObject<job::JobTimestamps>>,
+}
+
+#[versions(AGX)]
+impl GpuStruct for RunCompute::ver {
+    type Raw<'a> = raw::RunCompute::ver<'a>;
+}
+
+#[versions(AGX)]
+impl workqueue::Command for RunCompute::ver {}
diff --git a/drivers/gpu/drm/asahi/fw/event.rs b/drivers/gpu/drm/asahi/fw/event.rs
new file mode 100644
index 000000000000..fbf65ab6d976
--- /dev/null
+++ b/drivers/gpu/drm/asahi/fw/event.rs
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+
+//! GPU events control structures & stamps
+
+use super::types::*;
+use crate::{default_zeroed, trivial_gpustruct};
+use core::sync::atomic::Ordering;
+
+pub(crate) mod raw {
+    use super::*;
+
+    #[derive(Debug, Clone, Copy, Default)]
+    #[repr(C)]
+    pub(crate) struct LinkedListHead {
+        pub(crate) prev: Option<GpuWeakPointer<LinkedListHead>>,
+        pub(crate) next: Option<GpuWeakPointer<LinkedListHead>>,
+    }
+
+    #[derive(Debug, Clone, Copy)]
+    #[repr(C)]
+    pub(crate) struct NotifierList {
+        pub(crate) list_head: LinkedListHead,
+        pub(crate) unkptr_10: U64,
+    }
+    default_zeroed!(NotifierList);
+
+    #[versions(AGX)]
+    #[derive(Debug, Clone, Copy)]
+    #[repr(C)]
+    pub(crate) struct NotifierState {
+        unk_14: u32,
+        unk_18: U64,
+        unk_20: u32,
+        vm_slot: u32,
+        has_vtx: u32,
+        pstamp_vtx: Array<4, U64>,
+        has_frag: u32,
+        pstamp_frag: Array<4, U64>,
+        has_comp: u32,
+        pstamp_comp: Array<4, U64>,
+        #[ver(G >= G14 && V < V13_0B4)]
+        unk_98_g14_0: Array<0x14, u8>,
+        in_list: u32,
+        list_head: LinkedListHead,
+        #[ver(G >= G14 && V < V13_0B4)]
+        unk_a8_g14_0: Pad<4>,
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_buf: Array<0x8, u8>, // Init to all-ff
+    }
+
+    #[versions(AGX)]
+    impl Default for NotifierState::ver {
+        fn default() -> Self {
+            #[allow(unused_mut)]
+            let mut s: Self = unsafe { core::mem::zeroed() };
+            #[ver(V >= V13_0B4)]
+            s.unk_buf = Array::new([0xff; 0x8]);
+            s
+        }
+    }
+
+    #[derive(Debug)]
+    #[repr(transparent)]
+    pub(crate) struct Threshold(AtomicU64);
+    default_zeroed!(Threshold);
+
+    impl Threshold {
+        pub(crate) fn increment(&self) {
+            // We could use fetch_add, but the non-LSE atomic
+            // sequence Rust produces confuses the hypervisor.
+            let v = self.0.load(Ordering::Relaxed);
+            self.0.store(v + 1, Ordering::Relaxed);
+        }
+    }
+
+    #[versions(AGX)]
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct Notifier<'a> {
+        pub(crate) threshold: GpuPointer<'a, super::Threshold>,
+        pub(crate) generation: AtomicU32,
+        pub(crate) cur_count: AtomicU32,
+        pub(crate) unk_10: AtomicU32,
+        pub(crate) state: NotifierState::ver,
+    }
+}
+
+trivial_gpustruct!(Threshold);
+trivial_gpustruct!(NotifierList);
+
+#[versions(AGX)]
+#[derive(Debug)]
+pub(crate) struct Notifier {
+    pub(crate) threshold: GpuObject<Threshold>,
+}
+
+#[versions(AGX)]
+impl GpuStruct for Notifier::ver {
+    type Raw<'a> = raw::Notifier::ver<'a>;
+}
diff --git a/drivers/gpu/drm/asahi/fw/fragment.rs b/drivers/gpu/drm/asahi/fw/fragment.rs
new file mode 100644
index 000000000000..eca275efb967
--- /dev/null
+++ b/drivers/gpu/drm/asahi/fw/fragment.rs
@@ -0,0 +1,276 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+
+//! GPU fragment job firmware structures
+
+use super::types::*;
+use super::{event, job, workqueue};
+use crate::{buffer, fw, microseq, mmu};
+use kernel::sync::Arc;
+
+pub(crate) mod raw {
+    use super::*;
+
+    #[derive(Debug, Clone, Copy)]
+    #[repr(C)]
+    pub(crate) struct ClearPipelineBinding {
+        pub(crate) pipeline_bind: U64,
+        pub(crate) address: U64,
+    }
+
+    #[derive(Debug, Clone, Copy, Default)]
+    #[repr(C)]
+    pub(crate) struct StorePipelineBinding {
+        pub(crate) unk_0: U64,
+        pub(crate) unk_8: u32,
+        pub(crate) pipeline_bind: u32,
+        pub(crate) unk_10: u32,
+        pub(crate) address: u32,
+        pub(crate) unk_18: u32,
+        pub(crate) unk_1c_padding: u32,
+    }
+
+    impl StorePipelineBinding {
+        pub(crate) fn new(pipeline_bind: u32, address: u32) -> StorePipelineBinding {
+            StorePipelineBinding {
+                pipeline_bind,
+                address,
+                ..Default::default()
+            }
+        }
+    }
+
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct ArrayAddr {
+        pub(crate) ptr: U64,
+        pub(crate) unk_padding: U64,
+    }
+
+    #[versions(AGX)]
+    #[derive(Debug, Clone, Copy)]
+    #[repr(C)]
+    pub(crate) struct AuxFBInfo {
+        pub(crate) iogpu_unk_214: u32,
+        pub(crate) unk2: u32,
+        pub(crate) width: u32,
+        pub(crate) height: u32,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk3: U64,
+    }
+
+    #[versions(AGX)]
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct JobParameters1<'a> {
+        pub(crate) utile_config: u32,
+        pub(crate) unk_4: u32,
+        pub(crate) clear_pipeline: ClearPipelineBinding,
+        pub(crate) ppp_multisamplectl: U64,
+        pub(crate) scissor_array: U64,
+        pub(crate) depth_bias_array: U64,
+        pub(crate) aux_fb_info: AuxFBInfo::ver,
+        pub(crate) depth_dimensions: U64,
+        pub(crate) visibility_result_buffer: U64,
+        pub(crate) zls_ctrl: U64,
+
+        #[ver(G >= G14)]
+        pub(crate) unk_58_g14_0: U64,
+        #[ver(G >= G14)]
+        pub(crate) unk_58_g14_8: U64,
+
+        pub(crate) depth_buffer_ptr1: U64,
+        pub(crate) depth_buffer_ptr2: U64,
+        pub(crate) stencil_buffer_ptr1: U64,
+        pub(crate) stencil_buffer_ptr2: U64,
+
+        #[ver(G >= G14)]
+        pub(crate) unk_68_g14_0: Array<0x20, u8>,
+
+        pub(crate) unk_78: Array<0x4, U64>,
+        pub(crate) depth_meta_buffer_ptr1: U64,
+        pub(crate) unk_a0: U64,
+        pub(crate) depth_meta_buffer_ptr2: U64,
+        pub(crate) unk_b0: U64,
+        pub(crate) stencil_meta_buffer_ptr1: U64,
+        pub(crate) unk_c0: U64,
+        pub(crate) stencil_meta_buffer_ptr2: U64,
+        pub(crate) unk_d0: U64,
+        pub(crate) tvb_tilemap: GpuPointer<'a, &'a [u8]>,
+        pub(crate) tvb_heapmeta: GpuPointer<'a, &'a [u8]>,
+        pub(crate) mtile_stride_dwords: U64,
+        pub(crate) tvb_heapmeta_2: GpuPointer<'a, &'a [u8]>,
+        pub(crate) tile_config: U64,
+        pub(crate) aux_fb: GpuPointer<'a, &'a [u8]>,
+        pub(crate) unk_108: Array<0x6, U64>,
+        pub(crate) pipeline_base: U64,
+        pub(crate) unk_140: U64,
+        pub(crate) unk_148: U64,
+        pub(crate) unk_150: U64,
+        pub(crate) unk_158: U64,
+        pub(crate) unk_160: U64,
+
+        #[ver(G < G14)]
+        pub(crate) unk_168_padding: Array<0x1d8, u8>,
+        #[ver(G >= G14)]
+        pub(crate) unk_168_padding: Array<0x1a8, u8>,
+        #[ver(V < V13_0B4)]
+        pub(crate) __pad0: Pad<0x8>,
+    }
+
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct JobParameters2 {
+        pub(crate) store_pipeline_bind: u32,
+        pub(crate) store_pipeline_addr: u32,
+        pub(crate) unk_8: u32,
+        pub(crate) unk_c: u32,
+        pub(crate) merge_upper_x: F32,
+        pub(crate) merge_upper_y: F32,
+        pub(crate) unk_18: U64,
+        pub(crate) utiles_per_mtile_y: u16,
+        pub(crate) utiles_per_mtile_x: u16,
+        pub(crate) unk_24: u32,
+        pub(crate) tile_counts: u32,
+        pub(crate) iogpu_unk_212: u32,
+        pub(crate) isp_bgobjdepth: u32,
+        pub(crate) isp_bgobjvals: u32,
+        pub(crate) unk_38: u32,
+        pub(crate) unk_3c: u32,
+        pub(crate) unk_40: u32,
+    }
+
+    #[versions(AGX)]
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct JobParameters3 {
+        pub(crate) unk_44_padding: Array<0xac, u8>,
+        pub(crate) depth_bias_array: ArrayAddr,
+        pub(crate) scissor_array: ArrayAddr,
+        pub(crate) visibility_result_buffer: U64,
+        pub(crate) unk_118: U64,
+        pub(crate) unk_120: Array<0x25, U64>,
+        pub(crate) unk_reload_pipeline: ClearPipelineBinding,
+        pub(crate) unk_258: U64,
+        pub(crate) unk_260: U64,
+        pub(crate) unk_268: U64,
+        pub(crate) unk_270: U64,
+        pub(crate) reload_pipeline: ClearPipelineBinding,
+        pub(crate) zls_ctrl: U64,
+        pub(crate) unk_290: U64,
+        pub(crate) depth_buffer_ptr1: U64,
+        pub(crate) unk_2a0: U64,
+        pub(crate) unk_2a8: U64,
+        pub(crate) depth_buffer_ptr2: U64,
+        pub(crate) depth_buffer_ptr3: U64,
+        pub(crate) depth_meta_buffer_ptr3: U64,
+        pub(crate) stencil_buffer_ptr1: U64,
+        pub(crate) unk_2d0: U64,
+        pub(crate) unk_2d8: U64,
+        pub(crate) stencil_buffer_ptr2: U64,
+        pub(crate) stencil_buffer_ptr3: U64,
+        pub(crate) stencil_meta_buffer_ptr3: U64,
+        pub(crate) unk_2f8: Array<2, U64>,
+        pub(crate) iogpu_unk_212: u32,
+        pub(crate) unk_30c: u32,
+        pub(crate) aux_fb_info: AuxFBInfo::ver,
+        pub(crate) unk_320_padding: Array<0x10, u8>,
+        pub(crate) unk_partial_store_pipeline: StorePipelineBinding,
+        pub(crate) partial_store_pipeline: StorePipelineBinding,
+        pub(crate) isp_bgobjdepth: u32,
+        pub(crate) isp_bgobjvals: u32,
+        pub(crate) iogpu_unk_49: u32,
+        pub(crate) unk_37c: u32,
+        pub(crate) unk_380: U64,
+        pub(crate) unk_388: U64,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_390_0: U64,
+
+        pub(crate) depth_dimensions: U64,
+    }
+
+    #[versions(AGX)]
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct RunFragment<'a> {
+        pub(crate) tag: workqueue::CommandType,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) counter: U64,
+
+        pub(crate) vm_slot: u32,
+        pub(crate) unk_8: u32,
+        pub(crate) microsequence: GpuPointer<'a, &'a [u8]>,
+        pub(crate) microsequence_size: u32,
+        pub(crate) notifier: GpuPointer<'a, event::Notifier::ver>,
+        pub(crate) buffer: GpuPointer<'a, fw::buffer::Info::ver>,
+        pub(crate) scene: GpuPointer<'a, fw::buffer::Scene::ver>,
+        pub(crate) unk_buffer_buf: GpuWeakPointer<[u8]>,
+        pub(crate) tvb_tilemap: GpuPointer<'a, &'a [u8]>,
+        pub(crate) ppp_multisamplectl: U64,
+        pub(crate) samples: u32,
+        pub(crate) tiles_per_mtile_y: u16,
+        pub(crate) tiles_per_mtile_x: u16,
+        pub(crate) unk_50: U64,
+        pub(crate) unk_58: U64,
+        pub(crate) merge_upper_x: F32,
+        pub(crate) merge_upper_y: F32,
+        pub(crate) unk_68: U64,
+        pub(crate) tile_count: U64,
+        pub(crate) job_params1: JobParameters1::ver<'a>,
+        pub(crate) job_params2: JobParameters2,
+        pub(crate) job_params3: JobParameters3::ver,
+        pub(crate) unk_758_flag: u32,
+        pub(crate) unk_75c_flag: u32,
+        pub(crate) unk_buf: Array<0x110, u8>,
+        pub(crate) busy_flag: u32,
+        pub(crate) tvb_overflow_count: u32,
+        pub(crate) unk_878: u32,
+        pub(crate) encoder_params: job::raw::EncoderParams<'a>,
+        pub(crate) process_empty_tiles: u32,
+        pub(crate) no_clear_pipeline_textures: u32,
+        pub(crate) unk_param: u32,
+        pub(crate) unk_pointee: u32,
+        pub(crate) meta: job::raw::JobMeta,
+        pub(crate) unk_after_meta: u32,
+        pub(crate) unk_buf_0: U64,
+        pub(crate) unk_buf_8: U64,
+        pub(crate) unk_buf_10: U64,
+        pub(crate) cur_ts: U64,
+        pub(crate) start_ts: Option<GpuPointer<'a, AtomicU64>>,
+        pub(crate) end_ts: Option<GpuPointer<'a, AtomicU64>>,
+        pub(crate) unk_914: u32,
+        pub(crate) unk_918: U64,
+        pub(crate) unk_920: u32,
+        pub(crate) client_sequence: u8,
+        pub(crate) pad_925: Array<3, u8>,
+        pub(crate) unk_928: u32,
+        pub(crate) unk_92c: u8,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_ts: U64,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_92d_8: Array<0x1b, u8>,
+    }
+}
+
+#[versions(AGX)]
+#[derive(Debug)]
+pub(crate) struct RunFragment {
+    pub(crate) notifier: Arc<GpuObject<event::Notifier::ver>>,
+    pub(crate) scene: Arc<buffer::Scene::ver>,
+    pub(crate) micro_seq: microseq::MicroSequence,
+    pub(crate) vm_bind: mmu::VmBind,
+    pub(crate) aux_fb: GpuArray<u8>,
+    pub(crate) timestamps: Arc<GpuObject<job::RenderTimestamps>>,
+}
+
+#[versions(AGX)]
+impl GpuStruct for RunFragment::ver {
+    type Raw<'a> = raw::RunFragment::ver<'a>;
+}
+
+#[versions(AGX)]
+impl workqueue::Command for RunFragment::ver {}
diff --git a/drivers/gpu/drm/asahi/fw/initdata.rs b/drivers/gpu/drm/asahi/fw/initdata.rs
new file mode 100644
index 000000000000..44de0c1cccf3
--- /dev/null
+++ b/drivers/gpu/drm/asahi/fw/initdata.rs
@@ -0,0 +1,1264 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+
+//! GPU initialization / global structures
+
+use super::channels;
+use super::types::*;
+use crate::{default_zeroed, no_debug, trivial_gpustruct};
+
+pub(crate) mod raw {
+    use super::*;
+
+    #[derive(Debug, Default)]
+    #[repr(C)]
+    pub(crate) struct ChannelRing<T: GpuStruct + Debug + Default, U: Copy> {
+        pub(crate) state: Option<GpuWeakPointer<T>>,
+        pub(crate) ring: Option<GpuWeakPointer<[U]>>,
+    }
+
+    #[versions(AGX)]
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct PipeChannels {
+        pub(crate) vtx: ChannelRing<channels::ChannelState, channels::PipeMsg::ver>,
+        pub(crate) frag: ChannelRing<channels::ChannelState, channels::PipeMsg::ver>,
+        pub(crate) comp: ChannelRing<channels::ChannelState, channels::PipeMsg::ver>,
+    }
+    #[versions(AGX)]
+    default_zeroed!(PipeChannels::ver);
+
+    #[derive(Debug, Default)]
+    #[repr(C)]
+    pub(crate) struct FwStatusFlags {
+        pub(crate) halt_count: AtomicU32,
+        __pad0: Pad<0xc>,
+        pub(crate) halted: AtomicU32,
+        __pad1: Pad<0xc>,
+        pub(crate) resume: AtomicU32,
+        __pad2: Pad<0xc>,
+        pub(crate) unk_40: u32,
+        __pad3: Pad<0xc>,
+        pub(crate) unk_ctr: u32,
+        __pad4: Pad<0xc>,
+        pub(crate) unk_60: u32,
+        __pad5: Pad<0xc>,
+        pub(crate) unk_70: u32,
+        __pad6: Pad<0xc>,
+    }
+
+    #[derive(Debug, Default)]
+    #[repr(C)]
+    pub(crate) struct FwStatus {
+        pub(crate) fwctl_channel: ChannelRing<channels::FwCtlChannelState, channels::FwCtlMsg>,
+        pub(crate) flags: FwStatusFlags,
+    }
+
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct HwDataShared1 {
+        pub(crate) table: Array<16, i32>,
+        pub(crate) unk_44: Array<0x60, u8>,
+        pub(crate) unk_a4: u32,
+        pub(crate) unk_a8: u32,
+    }
+    default_zeroed!(HwDataShared1);
+
+    #[derive(Debug, Default)]
+    #[repr(C)]
+    pub(crate) struct HwDataShared2Curve {
+        pub(crate) unk_0: u32,
+        pub(crate) unk_4: u32,
+        pub(crate) t1: Array<16, i16>,
+        pub(crate) t2: Array<16, i16>,
+        pub(crate) t3: Array<8, Array<16, i32>>,
+    }
+
+    #[derive(Debug, Default)]
+    #[repr(C)]
+    pub(crate) struct HwDataShared2T8112 {
+        pub(crate) unk_0: Array<5, u32>,
+        pub(crate) unk_14: u32,
+        pub(crate) unk_18: Array<8, u32>,
+        pub(crate) curve1: HwDataShared2Curve,
+        pub(crate) curve2: HwDataShared2Curve,
+    }
+
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct HwDataShared2 {
+        pub(crate) table: Array<10, i32>,
+        pub(crate) unk_28: Array<0x10, u8>,
+        pub(crate) t8112: HwDataShared2T8112,
+        pub(crate) unk_500: u32,
+        pub(crate) unk_504: u32,
+        pub(crate) unk_508: u32,
+        pub(crate) unk_50c: u32,
+        pub(crate) unk_510: u32,
+    }
+    default_zeroed!(HwDataShared2);
+
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct HwDataShared3 {
+        pub(crate) unk_0: u32,
+        pub(crate) unk_4: u32,
+        pub(crate) unk_8: u32,
+        pub(crate) table: Array<16, u32>,
+        pub(crate) unk_4c: u32,
+    }
+    default_zeroed!(HwDataShared3);
+
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct HwDataA130Extra {
+        pub(crate) unk_0: Array<0x38, u8>,
+        pub(crate) unk_38: u32,
+        pub(crate) unk_3c: u32,
+        pub(crate) unk_40: u32,
+        pub(crate) unk_44: u32,
+        pub(crate) unk_48: u32,
+        pub(crate) unk_4c: u32,
+        pub(crate) unk_50: u32,
+        pub(crate) unk_54: u32,
+        pub(crate) unk_58: u32,
+        pub(crate) unk_5c: u32,
+        pub(crate) unk_60: F32,
+        pub(crate) unk_64: F32,
+        pub(crate) unk_68: F32,
+        pub(crate) unk_6c: F32,
+        pub(crate) unk_70: F32,
+        pub(crate) unk_74: F32,
+        pub(crate) unk_78: F32,
+        pub(crate) unk_7c: F32,
+        pub(crate) unk_80: F32,
+        pub(crate) unk_84: F32,
+        pub(crate) unk_88: u32,
+        pub(crate) unk_8c: u32,
+        pub(crate) max_pstate_scaled_1: u32,
+        pub(crate) unk_94: u32,
+        pub(crate) unk_98: u32,
+        pub(crate) unk_9c: F32,
+        pub(crate) unk_a0: u32,
+        pub(crate) unk_a4: u32,
+        pub(crate) unk_a8: u32,
+        pub(crate) unk_ac: u32,
+        pub(crate) unk_b0: u32,
+        pub(crate) unk_b4: u32,
+        pub(crate) unk_b8: u32,
+        pub(crate) unk_bc: u32,
+        pub(crate) unk_c0: u32,
+        pub(crate) unk_c4: F32,
+        pub(crate) unk_c8: Array<0x4c, u8>,
+        pub(crate) unk_114: F32,
+        pub(crate) unk_118: u32,
+        pub(crate) unk_11c: u32,
+        pub(crate) unk_120: u32,
+        pub(crate) unk_124: u32,
+        pub(crate) max_pstate_scaled_2: u32,
+        pub(crate) unk_12c: Array<0x8c, u8>,
+    }
+    default_zeroed!(HwDataA130Extra);
+
+    #[derive(Default)]
+    #[repr(C)]
+    pub(crate) struct T81xxData {
+        pub(crate) unk_d8c: u32,
+        pub(crate) unk_d90: u32,
+        pub(crate) unk_d94: u32,
+        pub(crate) unk_d98: u32,
+        pub(crate) unk_d9c: F32,
+        pub(crate) unk_da0: u32,
+        pub(crate) unk_da4: F32,
+        pub(crate) unk_da8: u32,
+        pub(crate) unk_dac: F32,
+        pub(crate) unk_db0: u32,
+        pub(crate) unk_db4: u32,
+        pub(crate) unk_db8: F32,
+        pub(crate) unk_dbc: F32,
+        pub(crate) unk_dc0: u32,
+        pub(crate) unk_dc4: u32,
+        pub(crate) unk_dc8: u32,
+        pub(crate) max_pstate_scaled: u32,
+    }
+
+    #[versions(AGX)]
+    #[derive(Default, Copy, Clone)]
+    #[repr(C)]
+    pub(crate) struct PowerZone {
+        pub(crate) val: F32,
+        pub(crate) target: u32,
+        pub(crate) target_off: u32,
+        pub(crate) filter_tc_x4: u32,
+        pub(crate) filter_tc_xperiod: u32,
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_10: u32,
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_14: u32,
+        pub(crate) filter_a_neg: F32,
+        pub(crate) filter_a: F32,
+        pub(crate) pad: u32,
+    }
+
+    #[versions(AGX)]
+    #[repr(C)]
+    pub(crate) struct HwDataA {
+        pub(crate) unk_0: u32,
+        pub(crate) clocks_per_period: u32,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) clocks_per_period_2: u32,
+
+        pub(crate) unk_8: u32,
+        pub(crate) pwr_status: AtomicU32,
+        pub(crate) unk_10: F32,
+        pub(crate) unk_14: u32,
+        pub(crate) unk_18: u32,
+        pub(crate) unk_1c: u32,
+        pub(crate) unk_20: u32,
+        pub(crate) unk_24: u32,
+        pub(crate) actual_pstate: u32,
+        pub(crate) tgt_pstate: u32,
+        pub(crate) unk_30: u32,
+        pub(crate) cur_pstate: u32,
+        pub(crate) unk_38: u32,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_3c_0: u32,
+
+        pub(crate) base_pstate_scaled: u32,
+        pub(crate) unk_40: u32,
+        pub(crate) max_pstate_scaled: u32,
+        pub(crate) unk_48: u32,
+        pub(crate) min_pstate_scaled: u32,
+        pub(crate) freq_mhz: F32,
+        pub(crate) unk_54: Array<0x20, u8>,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_74_0: u32,
+
+        pub(crate) sram_k: Array<0x10, F32>,
+        pub(crate) unk_b4: Array<0x100, u8>,
+        pub(crate) unk_1b4: u32,
+        pub(crate) temp_c: u32,
+        pub(crate) avg_power_mw: u32,
+        pub(crate) update_ts: U64,
+        pub(crate) unk_1c8: u32,
+        pub(crate) unk_1cc: Array<0x478, u8>,
+        pub(crate) pad_644: Pad<0x8>,
+        pub(crate) unk_64c: u32,
+        pub(crate) unk_650: u32,
+        pub(crate) pad_654: u32,
+        pub(crate) pwr_filter_a_neg: F32,
+        pub(crate) pad_65c: u32,
+        pub(crate) pwr_filter_a: F32,
+        pub(crate) pad_664: u32,
+        pub(crate) pwr_integral_gain: F32,
+        pub(crate) pad_66c: u32,
+        pub(crate) pwr_integral_min_clamp: F32,
+        pub(crate) max_power_1: F32,
+        pub(crate) pwr_proportional_gain: F32,
+        pub(crate) pad_67c: u32,
+        pub(crate) pwr_pstate_related_k: F32,
+        pub(crate) pwr_pstate_max_dc_offset: i32,
+        pub(crate) unk_688: u32,
+        pub(crate) max_pstate_scaled_2: u32,
+        pub(crate) pad_690: u32,
+        pub(crate) unk_694: u32,
+        pub(crate) max_power_2: u32,
+        pub(crate) pad_69c: Pad<0x18>,
+        pub(crate) unk_6b4: u32,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_6b8_0: Array<0x10, u8>,
+
+        pub(crate) max_pstate_scaled_3: u32,
+        pub(crate) unk_6bc: u32,
+        pub(crate) pad_6c0: Pad<0x14>,
+        pub(crate) ppm_filter_tc_periods_x4: u32,
+        pub(crate) unk_6d8: u32,
+        pub(crate) pad_6dc: u32,
+        pub(crate) ppm_filter_a_neg: F32,
+        pub(crate) pad_6e4: u32,
+        pub(crate) ppm_filter_a: F32,
+        pub(crate) pad_6ec: u32,
+        pub(crate) ppm_ki_dt: F32,
+        pub(crate) pad_6f4: u32,
+        pub(crate) pwr_integral_min_clamp_2: u32,
+        pub(crate) unk_6fc: F32,
+        pub(crate) ppm_kp: F32,
+        pub(crate) pad_704: u32,
+        pub(crate) unk_708: u32,
+        pub(crate) pwr_min_duty_cycle: u32,
+        pub(crate) max_pstate_scaled_4: u32,
+        pub(crate) unk_714: u32,
+        pub(crate) pad_718: u32,
+        pub(crate) unk_71c: F32,
+        pub(crate) max_power_3: u32,
+        pub(crate) cur_power_mw_2: u32,
+        pub(crate) ppm_filter_tc_ms: u32,
+        pub(crate) unk_72c: u32,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) ppm_filter_tc_clks: u32,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_730_4: u32,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_730_8: u32,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_730_c: u32,
+
+        pub(crate) unk_730: F32,
+        pub(crate) unk_734: u32,
+        pub(crate) unk_738: u32,
+        pub(crate) unk_73c: u32,
+        pub(crate) unk_740: u32,
+        pub(crate) unk_744: u32,
+        pub(crate) unk_748: Array<0x4, F32>,
+        pub(crate) unk_758: u32,
+        pub(crate) perf_tgt_utilization: u32,
+        pub(crate) pad_760: u32,
+        pub(crate) perf_boost_min_util: u32,
+        pub(crate) perf_boost_ce_step: u32,
+        pub(crate) perf_reset_iters: u32,
+        pub(crate) pad_770: u32,
+        pub(crate) unk_774: u32,
+        pub(crate) unk_778: u32,
+        pub(crate) perf_filter_drop_threshold: u32,
+        pub(crate) perf_filter_a_neg: F32,
+        pub(crate) perf_filter_a2_neg: F32,
+        pub(crate) perf_filter_a: F32,
+        pub(crate) perf_filter_a2: F32,
+        pub(crate) perf_ki: F32,
+        pub(crate) perf_ki2: F32,
+        pub(crate) perf_integral_min_clamp: F32,
+        pub(crate) unk_79c: F32,
+        pub(crate) perf_kp: F32,
+        pub(crate) perf_kp2: F32,
+        pub(crate) boost_state_unk_k: F32,
+        pub(crate) base_pstate_scaled_2: u32,
+        pub(crate) max_pstate_scaled_5: u32,
+        pub(crate) base_pstate_scaled_3: u32,
+        pub(crate) pad_7b8: u32,
+        pub(crate) perf_cur_utilization: F32,
+        pub(crate) perf_tgt_utilization_2: u32,
+        pub(crate) pad_7c4: Pad<0x18>,
+        pub(crate) unk_7dc: u32,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_7e0_0: Array<0x10, u8>,
+
+        pub(crate) base_pstate_scaled_4: u32,
+        pub(crate) pad_7e4: u32,
+        pub(crate) unk_7e8: Array<0x14, u8>,
+        pub(crate) unk_7fc: F32,
+        pub(crate) pwr_min_duty_cycle_2: F32,
+        pub(crate) max_pstate_scaled_6: F32,
+        pub(crate) max_freq_mhz: u32,
+        pub(crate) pad_80c: u32,
+        pub(crate) unk_810: u32,
+        pub(crate) pad_814: u32,
+        pub(crate) pwr_min_duty_cycle_3: u32,
+        pub(crate) unk_81c: u32,
+        pub(crate) pad_820: u32,
+        pub(crate) min_pstate_scaled_4: F32,
+        pub(crate) max_pstate_scaled_7: u32,
+        pub(crate) unk_82c: u32,
+        pub(crate) unk_alpha_neg: F32,
+        pub(crate) unk_alpha: F32,
+        pub(crate) unk_838: u32,
+        pub(crate) unk_83c: u32,
+        pub(crate) pad_840: Pad<0x2c>,
+        pub(crate) unk_86c: u32,
+        pub(crate) fast_die0_sensor_mask: U64,
+        pub(crate) fast_die0_release_temp_cc: u32,
+        pub(crate) unk_87c: i32,
+        pub(crate) unk_880: u32,
+        pub(crate) unk_884: u32,
+        pub(crate) pad_888: u32,
+        pub(crate) unk_88c: u32,
+        pub(crate) pad_890: u32,
+        pub(crate) unk_894: F32,
+        pub(crate) pad_898: u32,
+        pub(crate) fast_die0_ki_dt: F32,
+        pub(crate) pad_8a0: u32,
+        pub(crate) unk_8a4: u32,
+        pub(crate) unk_8a8: F32,
+        pub(crate) fast_die0_kp: F32,
+        pub(crate) pad_8b0: u32,
+        pub(crate) unk_8b4: u32,
+        pub(crate) pwr_min_duty_cycle_4: u32,
+        pub(crate) max_pstate_scaled_8: u32,
+        pub(crate) max_pstate_scaled_9: u32,
+        pub(crate) fast_die0_prop_tgt_delta: u32,
+        pub(crate) unk_8c8: u32,
+        pub(crate) unk_8cc: u32,
+        pub(crate) pad_8d0: Pad<0x14>,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_8e4_0: Array<0x10, u8>,
+
+        pub(crate) unk_8e4: u32,
+        pub(crate) unk_8e8: u32,
+        pub(crate) max_pstate_scaled_10: u32,
+        pub(crate) unk_8f0: u32,
+        pub(crate) unk_8f4: u32,
+        pub(crate) pad_8f8: u32,
+        pub(crate) pad_8fc: u32,
+        pub(crate) unk_900: Array<0x24, u8>,
+        pub(crate) unk_coef_a1: Array<8, Array<8, F32>>,
+        pub(crate) unk_coef_a2: Array<8, Array<8, F32>>,
+        pub(crate) pad_b24: Pad<0x70>,
+        pub(crate) max_pstate_scaled_11: u32,
+        pub(crate) freq_with_off: u32,
+        pub(crate) unk_b9c: u32,
+        pub(crate) unk_ba0: U64,
+        pub(crate) unk_ba8: U64,
+        pub(crate) unk_bb0: u32,
+        pub(crate) unk_bb4: u32,
+        pub(crate) pad_bb8: Pad<0x74>,
+        pub(crate) unk_c2c: u32,
+        pub(crate) power_zone_count: u32,
+        pub(crate) max_power_4: u32,
+        pub(crate) max_power_5: u32,
+        pub(crate) max_power_6: u32,
+        pub(crate) unk_c40: u32,
+        pub(crate) unk_c44: F32,
+        pub(crate) avg_power_target_filter_a_neg: F32,
+        pub(crate) avg_power_target_filter_a: F32,
+        pub(crate) avg_power_target_filter_tc_x4: u32,
+        pub(crate) avg_power_target_filter_tc_xperiod: u32,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) avg_power_target_filter_tc_clks: u32,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_c58_4: u32,
+
+        pub(crate) power_zones: Array<5, PowerZone::ver>,
+        pub(crate) avg_power_filter_tc_periods_x4: u32,
+        pub(crate) unk_cfc: u32,
+        pub(crate) unk_d00: u32,
+        pub(crate) avg_power_filter_a_neg: F32,
+        pub(crate) unk_d08: u32,
+        pub(crate) avg_power_filter_a: F32,
+        pub(crate) unk_d10: u32,
+        pub(crate) avg_power_ki_dt: F32,
+        pub(crate) unk_d18: u32,
+        pub(crate) unk_d1c: u32,
+        pub(crate) unk_d20: F32,
+        pub(crate) avg_power_kp: F32,
+        pub(crate) unk_d28: u32,
+        pub(crate) unk_d2c: u32,
+        pub(crate) avg_power_min_duty_cycle: u32,
+        pub(crate) max_pstate_scaled_12: u32,
+        pub(crate) max_pstate_scaled_13: u32,
+        pub(crate) unk_d3c: u32,
+        pub(crate) max_power_7: F32,
+        pub(crate) max_power_8: u32,
+        pub(crate) unk_d48: u32,
+        pub(crate) avg_power_filter_tc_ms: u32,
+        pub(crate) unk_d50: u32,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) avg_power_filter_tc_clks: u32,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_d54_4: Array<0xc, u8>,
+
+        pub(crate) unk_d54: Array<0x10, u8>,
+        pub(crate) max_pstate_scaled_14: u32,
+        pub(crate) unk_d68: Array<0x24, u8>,
+
+        pub(crate) t81xx_data: T81xxData,
+
+        pub(crate) unk_dd0: Array<0x40, u8>,
+
+        #[ver(V >= V13_2)]
+        pub(crate) unk_e10_pad: Array<0x10, u8>,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_e10_0: HwDataA130Extra,
+
+        pub(crate) unk_e10: Array<0xc, u8>,
+        pub(crate) fast_die0_sensor_mask_2: U64,
+        pub(crate) unk_e24: u32,
+        pub(crate) unk_e28: u32,
+        pub(crate) unk_e2c: Pad<0x1c>,
+        pub(crate) unk_coef_b1: Array<8, Array<8, F32>>,
+        pub(crate) unk_coef_b2: Array<8, Array<8, F32>>,
+        pub(crate) pad_1048: Pad<0x5e4>,
+        pub(crate) fast_die0_sensor_mask_alt: U64,
+        #[ver(V < V13_0B4)]
+        pub(crate) fast_die0_sensor_present: U64,
+
+        pub(crate) unk_163c: u32,
+
+        pub(crate) unk_1640: Array<0x2000, u8>,
+        pub(crate) unk_3640: u32,
+        pub(crate) unk_3644: u32,
+        pub(crate) hws1: HwDataShared1,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_pad1: Pad<0x20>,
+
+        pub(crate) hws2: HwDataShared2,
+        pub(crate) unk_3c04: u32,
+        pub(crate) hws3: HwDataShared3,
+        pub(crate) unk_3c58: Array<0x3c, u8>,
+        pub(crate) unk_3c94: u32,
+        pub(crate) unk_3c98: U64,
+        pub(crate) unk_3ca0: U64,
+        pub(crate) unk_3ca8: U64,
+        pub(crate) unk_3cb0: U64,
+        pub(crate) ts_last_idle: U64,
+        pub(crate) ts_last_poweron: U64,
+        pub(crate) ts_last_poweroff: U64,
+        pub(crate) unk_3cd0: U64,
+        pub(crate) unk_3cd8: U64,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_3ce0_0: u32,
+
+        pub(crate) unk_3ce0: u32,
+        pub(crate) unk_3ce4: u32,
+        pub(crate) unk_3ce8: u32,
+        pub(crate) unk_3cec: u32,
+        pub(crate) unk_3cf0: u32,
+        pub(crate) core_leak_coef: Array<8, F32>,
+        pub(crate) sram_leak_coef: Array<8, F32>,
+        pub(crate) unk_3d34: Array<0x38, u8>,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_3d6c: Array<0x38, u8>,
+    }
+    #[versions(AGX)]
+    default_zeroed!(HwDataA::ver);
+    #[versions(AGX)]
+    no_debug!(HwDataA::ver);
+
+    #[derive(Debug, Default, Clone, Copy)]
+    #[repr(C)]
+    pub(crate) struct IOMapping {
+        pub(crate) phys_addr: U64,
+        pub(crate) virt_addr: U64,
+        pub(crate) size: u32,
+        pub(crate) range_size: u32,
+        pub(crate) readwrite: U64,
+    }
+
+    #[versions(AGX)]
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct HwDataB {
+        #[ver(V < V13_0B4)]
+        pub(crate) unk_0: U64,
+
+        pub(crate) unk_8: U64,
+
+        #[ver(V < V13_0B4)]
+        pub(crate) unk_10: U64,
+
+        pub(crate) unk_18: U64,
+        pub(crate) unk_20: U64,
+        pub(crate) unk_28: U64,
+        pub(crate) unk_30: U64,
+        pub(crate) unkptr_38: U64,
+        pub(crate) pad_40: Pad<0x20>,
+
+        #[ver(V < V13_0B4)]
+        pub(crate) yuv_matrices: Array<0xf, Array<3, Array<4, i16>>>,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) yuv_matrices: Array<0x3f, Array<3, Array<4, i16>>>,
+
+        pub(crate) pad_1c8: Pad<0x8>,
+        pub(crate) io_mappings: Array<0x14, IOMapping>,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_450_0: Array<0x68, u8>,
+
+        pub(crate) chip_id: u32,
+        pub(crate) unk_454: u32,
+        pub(crate) unk_458: u32,
+        pub(crate) unk_45c: u32,
+        pub(crate) unk_460: u32,
+        pub(crate) unk_464: u32,
+        pub(crate) unk_468: u32,
+        pub(crate) unk_46c: u32,
+        pub(crate) unk_470: u32,
+        pub(crate) unk_474: u32,
+        pub(crate) unk_478: u32,
+        pub(crate) unk_47c: u32,
+        pub(crate) unk_480: u32,
+        pub(crate) unk_484: u32,
+        pub(crate) unk_488: u32,
+        pub(crate) unk_48c: u32,
+        pub(crate) base_clock_khz: u32,
+        pub(crate) power_sample_period: u32,
+        pub(crate) pad_498: Pad<0x4>,
+        pub(crate) unk_49c: u32,
+        pub(crate) unk_4a0: u32,
+        pub(crate) unk_4a4: u32,
+        pub(crate) pad_4a8: Pad<0x4>,
+        pub(crate) unk_4ac: u32,
+        pub(crate) pad_4b0: Pad<0x8>,
+        pub(crate) unk_4b8: u32,
+        pub(crate) unk_4bc: Array<0x4, u8>,
+        pub(crate) unk_4c0: u32,
+        pub(crate) unk_4c4: u32,
+        pub(crate) unk_4c8: u32,
+        pub(crate) unk_4cc: u32,
+        pub(crate) unk_4d0: u32,
+        pub(crate) unk_4d4: u32,
+        pub(crate) unk_4d8: Array<0x4, u8>,
+        pub(crate) unk_4dc: u32,
+        pub(crate) unk_4e0: U64,
+        pub(crate) unk_4e8: u32,
+        pub(crate) unk_4ec: u32,
+        pub(crate) unk_4f0: u32,
+        pub(crate) unk_4f4: u32,
+        pub(crate) unk_4f8: u32,
+        pub(crate) unk_4fc: u32,
+        pub(crate) unk_500: u32,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_504_0: u32,
+
+        pub(crate) unk_504: u32,
+        pub(crate) unk_508: u32,
+        pub(crate) unk_50c: u32,
+        pub(crate) unk_510: u32,
+        pub(crate) unk_514: u32,
+        pub(crate) unk_518: u32,
+        pub(crate) unk_51c: u32,
+        pub(crate) unk_520: u32,
+        pub(crate) unk_524: u32,
+        pub(crate) unk_528: u32,
+        pub(crate) unk_52c: u32,
+        pub(crate) unk_530: u32,
+        pub(crate) unk_534: u32,
+        pub(crate) unk_538: u32,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_53c_0: u32,
+
+        pub(crate) num_frags: u32,
+        pub(crate) unk_540: u32,
+        pub(crate) unk_544: u32,
+        pub(crate) unk_548: u32,
+        pub(crate) unk_54c: u32,
+        pub(crate) unk_550: u32,
+        pub(crate) unk_554: u32,
+        pub(crate) uat_ttb_base: U64,
+        pub(crate) gpu_core_id: u32,
+        pub(crate) gpu_rev_id: u32,
+        pub(crate) num_cores: u32,
+        pub(crate) max_pstate: u32,
+
+        #[ver(V < V13_0B4)]
+        pub(crate) num_pstates: u32,
+
+        pub(crate) frequencies: Array<0x10, u32>,
+        pub(crate) voltages: Array<0x10, [u32; 0x8]>,
+        pub(crate) voltages_sram: Array<0x10, [u32; 0x8]>,
+        pub(crate) sram_k: Array<0x10, F32>,
+        pub(crate) unk_9f4: Array<0x10, u32>,
+        pub(crate) rel_max_powers: Array<0x10, u32>,
+        pub(crate) rel_boost_freqs: Array<0x10, u32>,
+
+        #[ver(V < V13_0B4)]
+        pub(crate) min_sram_volt: u32,
+
+        #[ver(V < V13_0B4)]
+        pub(crate) unk_ab8: u32,
+
+        #[ver(V < V13_0B4)]
+        pub(crate) unk_abc: u32,
+
+        #[ver(V < V13_0B4)]
+        pub(crate) unk_ac0: u32,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_ac4_0: Array<0x1f0, u8>,
+
+        pub(crate) pad_ac4: Pad<0x8>,
+        pub(crate) unk_acc: u32,
+        pub(crate) unk_ad0: u32,
+        pub(crate) pad_ad4: Pad<0x10>,
+        pub(crate) unk_ae4: Array<0x4, u32>,
+        pub(crate) pad_af4: Pad<0x4>,
+        pub(crate) unk_af8: u32,
+        pub(crate) pad_afc: Pad<0x8>,
+        pub(crate) unk_b04: u32,
+        pub(crate) unk_b08: u32,
+        pub(crate) unk_b0c: u32,
+        pub(crate) unk_b10: u32,
+        pub(crate) pad_b14: Pad<0x8>,
+        pub(crate) unk_b1c: u32,
+        pub(crate) unk_b20: u32,
+        pub(crate) unk_b24: u32,
+        pub(crate) unk_b28: u32,
+        pub(crate) unk_b2c: u32,
+        pub(crate) unk_b30: u32,
+        pub(crate) unk_b34: u32,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_b38_0: u32,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_b38_4: u32,
+
+        pub(crate) unk_b38: Array<0xc, u32>,
+        pub(crate) unk_b68: u32,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_b6c: Array<0xd0, u8>,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_c3c: u32,
+    }
+    #[versions(AGX)]
+    default_zeroed!(HwDataB::ver);
+
+    #[derive(Debug, Clone, Copy)]
+    #[repr(C, packed)]
+    pub(crate) struct GpuQueueStatsVtx {
+        pub(crate) busy: u32,
+        pub(crate) unk_4: u32,
+        pub(crate) cur_cmdqueue: U64,
+        pub(crate) cur_count: u32,
+        pub(crate) unk_14: u32,
+    }
+    default_zeroed!(GpuQueueStatsVtx);
+
+    #[versions(AGX)]
+    #[derive(Debug, Default, Clone, Copy)]
+    #[repr(C, packed)]
+    pub(crate) struct GpuStatsVtx {
+        pub(crate) unk_4: u32,
+        pub(crate) queues: Array<0x4, GpuQueueStatsVtx>,
+        pub(crate) unk_68: Array<0x8, u8>,
+        pub(crate) unk_70: u32,
+        pub(crate) unk_74: u32,
+        pub(crate) unk_timestamp: U64,
+        pub(crate) unk_80: Array<0x40, u8>,
+    }
+
+    #[derive(Debug, Default, Clone, Copy)]
+    #[repr(C, packed)]
+    pub(crate) struct GpuQueueStatsFrag {
+        pub(crate) busy: u32,
+        pub(crate) cur_cmdqueue: U64,
+        pub(crate) unk_c: u32,
+        pub(crate) unk_10: u32,
+        pub(crate) unk_14: Array<0x14, u8>,
+    }
+
+    #[versions(AGX)]
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct GpuStatsFrag {
+        pub(crate) unk_0: Array<0x18, u8>,
+        pub(crate) queues: Array<0x4, GpuQueueStatsFrag>,
+        pub(crate) unk_d0: Array<0x38, u8>,
+        pub(crate) tvb_overflows_1: u32,
+        pub(crate) tvb_overflows_2: u32,
+        pub(crate) unk_f8: u32,
+        pub(crate) unk_fc: u32,
+        pub(crate) cur_stamp_id: i32,
+        pub(crate) unk_104: Array<0x14, u8>,
+        pub(crate) unk_118: i32,
+        pub(crate) unk_11c: u32,
+        pub(crate) unk_120: u32,
+        pub(crate) unk_124: u32,
+        pub(crate) unk_128: u32,
+        pub(crate) unk_12c: u32,
+        pub(crate) unk_timestamp: U64,
+        pub(crate) unk_134: Array<0x8c, u8>,
+    }
+    #[versions(AGX)]
+    default_zeroed!(GpuStatsFrag::ver);
+
+    #[versions(AGX)]
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct GpuGlobalStatsVtx {
+        pub(crate) total_cmds: u32,
+        pub(crate) stats: GpuStatsVtx::ver,
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_pad: Array<0x5c4, u8>,
+    }
+    #[versions(AGX)]
+    default_zeroed!(GpuGlobalStatsVtx::ver);
+
+    #[versions(AGX)]
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct GpuGlobalStatsFrag {
+        pub(crate) total_cmds: u32,
+        pub(crate) unk_4: u32,
+        pub(crate) stats: GpuStatsFrag::ver,
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_pad: Array<0x580, u8>,
+    }
+    #[versions(AGX)]
+    default_zeroed!(GpuGlobalStatsFrag::ver);
+
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct GpuStatsComp {
+        pub(crate) unk: Array<0x140, u8>,
+    }
+    default_zeroed!(GpuStatsComp);
+
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct RuntimeScratch {
+        pub(crate) unk_280: Array<0x6800, u8>,
+        pub(crate) unk_6a80: u32,
+        pub(crate) gpu_idle: u32,
+        pub(crate) unkpad_6a88: Pad<0x14>,
+        pub(crate) unk_6a9c: u32,
+        pub(crate) unk_ctr0: u32,
+        pub(crate) unk_ctr1: u32,
+        pub(crate) unk_6aa8: u32,
+        pub(crate) unk_6aac: u32,
+        pub(crate) unk_ctr2: u32,
+        pub(crate) unk_6ab4: u32,
+        pub(crate) unk_6ab8: u32,
+        pub(crate) unk_6abc: u32,
+        pub(crate) unk_6ac0: u32,
+        pub(crate) unk_6ac4: u32,
+        pub(crate) unk_ctr3: u32,
+        pub(crate) unk_6acc: u32,
+        pub(crate) unk_6ad0: u32,
+        pub(crate) unk_6ad4: u32,
+        pub(crate) unk_6ad8: u32,
+        pub(crate) unk_6adc: u32,
+        pub(crate) unk_6ae0: u32,
+        pub(crate) unk_6ae4: u32,
+        pub(crate) unk_6ae8: u32,
+        pub(crate) unk_6aec: u32,
+        pub(crate) unk_6af0: u32,
+        pub(crate) unk_ctr4: u32,
+        pub(crate) unk_ctr5: u32,
+        pub(crate) unk_6afc: u32,
+        pub(crate) pad_6b00: Pad<0x38>,
+        pub(crate) unk_6b38: u32,
+        pub(crate) pad_6b3c: Pad<0x84>,
+    }
+    default_zeroed!(RuntimeScratch);
+
+    pub(crate) type BufferMgrCtl = Array<4, u32>;
+
+    #[versions(AGX)]
+    #[repr(C)]
+    pub(crate) struct RuntimePointers<'a> {
+        pub(crate) pipes: Array<4, PipeChannels::ver>,
+
+        pub(crate) device_control:
+            ChannelRing<channels::ChannelState, channels::DeviceControlMsg::ver>,
+        pub(crate) event: ChannelRing<channels::ChannelState, channels::RawEventMsg>,
+        pub(crate) fw_log: ChannelRing<channels::FwLogChannelState, channels::RawFwLogMsg>,
+        pub(crate) ktrace: ChannelRing<channels::ChannelState, channels::RawKTraceMsg>,
+        pub(crate) stats: ChannelRing<channels::ChannelState, channels::RawStatsMsg::ver>,
+
+        pub(crate) __pad0: Pad<0x50>,
+        pub(crate) unk_160: U64,
+        pub(crate) unk_168: U64,
+        pub(crate) stats_vtx: GpuPointer<'a, super::GpuGlobalStatsVtx::ver>,
+        pub(crate) stats_frag: GpuPointer<'a, super::GpuGlobalStatsFrag::ver>,
+        pub(crate) stats_comp: GpuPointer<'a, super::GpuStatsComp>,
+        pub(crate) hwdata_a: GpuPointer<'a, super::HwDataA::ver>,
+        pub(crate) unkptr_190: GpuPointer<'a, &'a [u8]>,
+        pub(crate) unkptr_198: GpuPointer<'a, &'a [u8]>,
+        pub(crate) hwdata_b: GpuPointer<'a, super::HwDataB::ver>,
+        pub(crate) hwdata_b_2: GpuPointer<'a, super::HwDataB::ver>,
+        pub(crate) fwlog_buf: Option<GpuWeakPointer<[channels::RawFwLogPayloadMsg]>>,
+        pub(crate) unkptr_1b8: GpuPointer<'a, &'a [u8]>,
+        pub(crate) unkptr_1c0: GpuPointer<'a, &'a [u8]>,
+        pub(crate) unkptr_1c8: GpuPointer<'a, &'a [u8]>,
+        pub(crate) unk_1d0: u32,
+        pub(crate) unk_1d4: u32,
+        pub(crate) unk_1d8: Array<0x3c, u8>,
+        pub(crate) buffer_mgr_ctl: GpuPointer<'a, &'a [BufferMgrCtl]>,
+        pub(crate) buffer_mgr_ctl_2: GpuPointer<'a, &'a [BufferMgrCtl]>,
+        pub(crate) __pad1: Pad<0x5c>,
+        pub(crate) gpu_scratch: RuntimeScratch,
+    }
+    #[versions(AGX)]
+    no_debug!(RuntimePointers::ver<'_>);
+
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct PendingStamp {
+        pub(crate) info: AtomicU32,
+        pub(crate) wait_value: AtomicU32,
+    }
+    default_zeroed!(PendingStamp);
+
+    #[derive(Debug, Clone, Copy)]
+    #[repr(C, packed)]
+    pub(crate) struct FaultInfo {
+        pub(crate) unk_0: u32,
+        pub(crate) unk_4: u32,
+        pub(crate) queue_uuid: u32,
+        pub(crate) unk_c: u32,
+        pub(crate) unk_10: u32,
+        pub(crate) unk_14: u32,
+    }
+    default_zeroed!(FaultInfo);
+
+    #[versions(AGX)]
+    #[derive(Debug, Clone, Copy)]
+    #[repr(C, packed)]
+    pub(crate) struct GlobalsSub {
+        pub(crate) unk_54: u16,
+        pub(crate) unk_56: u16,
+        pub(crate) unk_58: u16,
+        pub(crate) unk_5a: U32,
+        pub(crate) unk_5e: U32,
+        pub(crate) unk_62: U32,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_66_0: Array<0xc, u8>,
+
+        pub(crate) unk_66: U32,
+        pub(crate) unk_6a: Array<0x16, u8>,
+    }
+    #[versions(AGX)]
+    default_zeroed!(GlobalsSub::ver);
+
+    #[derive(Debug, Clone, Copy)]
+    #[repr(C)]
+    pub(crate) struct PowerZoneGlobal {
+        pub(crate) target: u32,
+        pub(crate) target_off: u32,
+        pub(crate) filter_tc: u32,
+    }
+    default_zeroed!(PowerZoneGlobal);
+
+    #[versions(AGX)]
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct Globals {
+        pub(crate) ktrace_enable: u32,
+        pub(crate) unk_4: Array<0x20, u8>,
+
+        #[ver(V >= V13_2)]
+        pub(crate) unk_24_0: u32,
+
+        pub(crate) unk_24: u32,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_28_0: u32,
+
+        pub(crate) unk_28: u32,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_2c_0: u32,
+
+        pub(crate) unk_2c: u32,
+        pub(crate) unk_30: u32,
+        pub(crate) unk_34: u32,
+        pub(crate) unk_38: Array<0x1c, u8>,
+
+        pub(crate) sub: GlobalsSub::ver,
+
+        pub(crate) unk_80: Array<0xf80, u8>,
+        pub(crate) unk_1000: Array<0x7000, u8>,
+        pub(crate) unk_8000: Array<0x900, u8>,
+
+        #[ver(V >= V13_0B4 && V < V13_2)]
+        pub(crate) unk_8900_0: u32,
+
+        pub(crate) unk_8900: u32,
+        pub(crate) pending_submissions: AtomicU32,
+        pub(crate) max_power: u32,
+        pub(crate) max_pstate_scaled: u32,
+        pub(crate) max_pstate_scaled_2: u32,
+        pub(crate) unk_8914: u32,
+        pub(crate) unk_8918: u32,
+        pub(crate) max_pstate_scaled_3: u32,
+        pub(crate) unk_8920: u32,
+        pub(crate) power_zone_count: u32,
+        pub(crate) avg_power_filter_tc_periods: u32,
+        pub(crate) avg_power_ki_dt: F32,
+        pub(crate) avg_power_kp: F32,
+        pub(crate) avg_power_min_duty_cycle: u32,
+        pub(crate) avg_power_target_filter_tc: u32,
+        pub(crate) power_zones: Array<5, PowerZoneGlobal>,
+        pub(crate) unk_8978: Array<0x44, u8>,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_89bc_0: Array<0x3c, u8>,
+
+        pub(crate) unk_89bc: u32,
+        pub(crate) fast_die0_release_temp: u32,
+        pub(crate) unk_89c4: i32,
+        pub(crate) fast_die0_prop_tgt_delta: u32,
+        pub(crate) fast_die0_kp: F32,
+        pub(crate) fast_die0_ki_dt: F32,
+        pub(crate) unk_89d4: Array<0xc, u8>,
+        pub(crate) unk_89e0: u32,
+        pub(crate) max_power_2: u32,
+        pub(crate) ppm_kp: F32,
+        pub(crate) ppm_ki_dt: F32,
+        pub(crate) unk_89f0: u32,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_89f4_0: Array<0x8, u8>,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_89f4_8: u32,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_89f4_c: Array<0x50, u8>,
+
+        pub(crate) unk_89f4: u32,
+        pub(crate) hws1: HwDataShared1,
+        pub(crate) hws2: HwDataShared2,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_hws2_0: Array<0x28, u8>,
+
+        pub(crate) hws3: HwDataShared3,
+        pub(crate) unk_9004: Array<8, u8>,
+        pub(crate) unk_900c: u32,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_9010_0: u32,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_9010_4: Array<0x14, u8>,
+
+        pub(crate) unk_9010: Array<0x2c, u8>,
+        pub(crate) unk_903c: u32,
+        pub(crate) unk_9040: Array<0xc0, u8>,
+        pub(crate) unk_9100: Array<0x6f00, u8>,
+        pub(crate) unk_10000: Array<0xe50, u8>,
+        pub(crate) unk_10e50: u32,
+        pub(crate) unk_10e54: Array<0x2c, u8>,
+        pub(crate) fault_control: u32,
+        pub(crate) do_init: u32,
+        pub(crate) unk_10e88: Array<0x188, u8>,
+        pub(crate) idle_ts: U64,
+        pub(crate) idle_unk: U64,
+        pub(crate) unk_11020: u32,
+        pub(crate) unk_11024: u32,
+        pub(crate) unk_11028: u32,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_1102c_0: u32,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_1102c_4: u32,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_1102c_8: u32,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_1102c_c: u32,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_1102c_10: u32,
+
+        pub(crate) unk_1102c: u32,
+        pub(crate) idle_off_delay_ms: AtomicU32,
+        pub(crate) fender_idle_off_delay_ms: u32,
+        pub(crate) fw_early_wake_timeout_ms: u32,
+        pub(crate) pending_stamps: Array<0x110, PendingStamp>,
+        pub(crate) unk_117bc: u32,
+        pub(crate) fault_info: FaultInfo,
+        pub(crate) counter: u32,
+        pub(crate) unk_118dc: u32,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_118e0_0: Array<0x9c, u8>,
+
+        pub(crate) unk_118e0: u32,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_118e4_0: u32,
+
+        pub(crate) unk_118e4: u32,
+        pub(crate) unk_118e8: u32,
+        pub(crate) unk_118ec: Array<0x15, u8>,
+        pub(crate) unk_11901: Array<0x43f, u8>,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_11d40: Array<0x19c, u8>,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_11edc: u32,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_11ee0: Array<0x1c, u8>,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_11efc: u32,
+    }
+    #[versions(AGX)]
+    default_zeroed!(Globals::ver);
+
+    #[derive(Debug, Default, Clone, Copy)]
+    #[repr(C, packed)]
+    pub(crate) struct UatLevelInfo {
+        pub(crate) unk_3: u8,
+        pub(crate) unk_1: u8,
+        pub(crate) unk_2: u8,
+        pub(crate) index_shift: u8,
+        pub(crate) num_entries: u16,
+        pub(crate) unk_4: u16,
+        pub(crate) unk_8: U64,
+        pub(crate) unk_10: U64,
+        pub(crate) index_mask: U64,
+    }
+
+    #[versions(AGX)]
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct InitData<'a> {
+        #[ver(V >= V13_0B4)]
+        pub(crate) ver_info: Array<0x4, u16>,
+
+        pub(crate) unk_buf: GpuPointer<'a, &'a [u8]>,
+        pub(crate) unk_8: u32,
+        pub(crate) unk_c: u32,
+        pub(crate) runtime_pointers: GpuPointer<'a, super::RuntimePointers::ver>,
+        pub(crate) globals: GpuPointer<'a, super::Globals::ver>,
+        pub(crate) fw_status: GpuPointer<'a, super::FwStatus>,
+        pub(crate) uat_page_size: u16,
+        pub(crate) uat_page_bits: u8,
+        pub(crate) uat_num_levels: u8,
+        pub(crate) uat_level_info: Array<0x3, UatLevelInfo>,
+        pub(crate) __pad0: Pad<0x14>,
+        pub(crate) host_mapped_fw_allocations: u32,
+        pub(crate) unk_ac: u32,
+        pub(crate) unk_b0: u32,
+        pub(crate) unk_b4: u32,
+        pub(crate) unk_b8: u32,
+    }
+}
+
+#[derive(Debug)]
+pub(crate) struct ChannelRing<T: GpuStruct + Debug + Default, U: Copy>
+where
+    for<'a> <T as GpuStruct>::Raw<'a>: Debug,
+{
+    pub(crate) state: GpuObject<T>,
+    pub(crate) ring: GpuArray<U>,
+}
+
+impl<T: GpuStruct + Debug + Default, U: Copy> ChannelRing<T, U>
+where
+    for<'a> <T as GpuStruct>::Raw<'a>: Debug,
+{
+    pub(crate) fn to_raw(&self) -> raw::ChannelRing<T, U> {
+        raw::ChannelRing {
+            state: Some(self.state.weak_pointer()),
+            ring: Some(self.ring.weak_pointer()),
+        }
+    }
+}
+
+trivial_gpustruct!(FwStatus);
+
+#[versions(AGX)]
+#[derive(Debug, Default)]
+pub(crate) struct GpuGlobalStatsVtx {}
+
+#[versions(AGX)]
+impl GpuStruct for GpuGlobalStatsVtx::ver {
+    type Raw<'a> = raw::GpuGlobalStatsVtx::ver;
+}
+
+#[versions(AGX)]
+#[derive(Debug, Default)]
+pub(crate) struct GpuGlobalStatsFrag {}
+
+#[versions(AGX)]
+impl GpuStruct for GpuGlobalStatsFrag::ver {
+    type Raw<'a> = raw::GpuGlobalStatsFrag::ver;
+}
+
+#[derive(Debug, Default)]
+pub(crate) struct GpuStatsComp {}
+
+impl GpuStruct for GpuStatsComp {
+    type Raw<'a> = raw::GpuStatsComp;
+}
+
+#[versions(AGX)]
+#[derive(Debug, Default)]
+pub(crate) struct HwDataA {}
+
+#[versions(AGX)]
+impl GpuStruct for HwDataA::ver {
+    type Raw<'a> = raw::HwDataA::ver;
+}
+
+#[versions(AGX)]
+#[derive(Debug, Default)]
+pub(crate) struct HwDataB {}
+
+#[versions(AGX)]
+impl GpuStruct for HwDataB::ver {
+    type Raw<'a> = raw::HwDataB::ver;
+}
+
+#[versions(AGX)]
+#[derive(Debug)]
+pub(crate) struct Stats {
+    pub(crate) vtx: GpuObject<GpuGlobalStatsVtx::ver>,
+    pub(crate) frag: GpuObject<GpuGlobalStatsFrag::ver>,
+    pub(crate) comp: GpuObject<GpuStatsComp>,
+}
+
+#[versions(AGX)]
+#[derive(Debug)]
+pub(crate) struct RuntimePointers {
+    pub(crate) stats: Stats::ver,
+
+    pub(crate) hwdata_a: GpuObject<HwDataA::ver>,
+    pub(crate) unkptr_190: GpuArray<u8>,
+    pub(crate) unkptr_198: GpuArray<u8>,
+    pub(crate) hwdata_b: GpuObject<HwDataB::ver>,
+
+    pub(crate) unkptr_1b8: GpuArray<u8>,
+    pub(crate) unkptr_1c0: GpuArray<u8>,
+    pub(crate) unkptr_1c8: GpuArray<u8>,
+
+    pub(crate) buffer_mgr_ctl: GpuArray<raw::BufferMgrCtl>,
+}
+
+#[versions(AGX)]
+impl GpuStruct for RuntimePointers::ver {
+    type Raw<'a> = raw::RuntimePointers::ver<'a>;
+}
+
+#[versions(AGX)]
+#[derive(Debug, Default)]
+pub(crate) struct Globals {}
+
+#[versions(AGX)]
+impl GpuStruct for Globals::ver {
+    type Raw<'a> = raw::Globals::ver;
+}
+
+#[versions(AGX)]
+#[derive(Debug)]
+pub(crate) struct InitData {
+    pub(crate) unk_buf: GpuArray<u8>,
+    pub(crate) runtime_pointers: GpuObject<RuntimePointers::ver>,
+    pub(crate) globals: GpuObject<Globals::ver>,
+    pub(crate) fw_status: GpuObject<FwStatus>,
+}
+
+#[versions(AGX)]
+impl GpuStruct for InitData::ver {
+    type Raw<'a> = raw::InitData::ver<'a>;
+}
diff --git a/drivers/gpu/drm/asahi/fw/job.rs b/drivers/gpu/drm/asahi/fw/job.rs
new file mode 100644
index 000000000000..a0bbf67b1b1d
--- /dev/null
+++ b/drivers/gpu/drm/asahi/fw/job.rs
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+
+//! Common GPU job firmware structures
+
+use super::types::*;
+use crate::{default_zeroed, trivial_gpustruct};
+
+pub(crate) mod raw {
+    use super::*;
+
+    #[derive(Debug, Clone, Copy)]
+    #[repr(C)]
+    pub(crate) struct JobMeta {
+        pub(crate) unk_4: u32,
+        pub(crate) stamp: GpuWeakPointer<Stamp>,
+        pub(crate) fw_stamp: GpuWeakPointer<FwStamp>,
+        pub(crate) stamp_value: EventValue,
+        pub(crate) stamp_slot: u32,
+        pub(crate) evctl_index: u32,
+        pub(crate) flush_stamps: u32,
+        pub(crate) uuid: u32,
+        pub(crate) cmd_seq: u32,
+    }
+
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct EncoderParams<'a> {
+        pub(crate) unk_8: u32,
+        pub(crate) unk_c: u32,
+        pub(crate) unk_10: u32,
+        pub(crate) encoder_id: u32,
+        pub(crate) unk_18: u32,
+        pub(crate) iogpu_compute_unk44: u32,
+        pub(crate) seq_buffer: GpuPointer<'a, &'a [u64]>,
+        pub(crate) unk_28: U64,
+    }
+
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct JobTimestamps {
+        pub(crate) start: AtomicU64,
+        pub(crate) end: AtomicU64,
+    }
+    default_zeroed!(JobTimestamps);
+
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct RenderTimestamps {
+        pub(crate) vtx: JobTimestamps,
+        pub(crate) frag: JobTimestamps,
+    }
+    default_zeroed!(RenderTimestamps);
+}
+
+trivial_gpustruct!(JobTimestamps);
+trivial_gpustruct!(RenderTimestamps);
diff --git a/drivers/gpu/drm/asahi/fw/microseq.rs b/drivers/gpu/drm/asahi/fw/microseq.rs
new file mode 100644
index 000000000000..8deea3fb9914
--- /dev/null
+++ b/drivers/gpu/drm/asahi/fw/microseq.rs
@@ -0,0 +1,384 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+
+//! GPU firmware microsequence operations
+
+use super::types::*;
+use super::{buffer, compute, fragment, initdata, vertex, workqueue};
+use crate::default_zeroed;
+
+pub(crate) trait Operation {}
+
+#[derive(Debug, Copy, Clone)]
+#[repr(u32)]
+enum OpCode {
+    WaitForIdle = 0x01,
+    RetireStamp = 0x18,
+    #[allow(dead_code)]
+    Timestamp = 0x19,
+    StartVertex = 0x22,
+    FinalizeVertex = 0x23,
+    StartFragment = 0x24,
+    FinalizeFragment = 0x25,
+    StartCompute = 0x29,
+    FinalizeCompute = 0x2a,
+}
+
+#[derive(Debug, Copy, Clone)]
+#[repr(u32)]
+pub(crate) enum Pipe {
+    Vertex = 1 << 0,
+    Fragment = 1 << 8,
+    Compute = 1 << 15,
+}
+
+pub(crate) const MAX_ATTACHMENTS: usize = 16;
+
+#[derive(Debug, Clone, Copy)]
+#[repr(C)]
+pub(crate) struct Attachment {
+    pub(crate) address: U64,
+    pub(crate) size: u32,
+    pub(crate) unk_c: u16,
+    pub(crate) unk_e: u16,
+}
+default_zeroed!(Attachment);
+
+#[derive(Debug, Clone, Copy, Default)]
+#[repr(C)]
+pub(crate) struct Attachments {
+    pub(crate) list: Array<MAX_ATTACHMENTS, Attachment>,
+    pub(crate) count: u32,
+}
+
+#[derive(Debug, Copy, Clone)]
+#[repr(transparent)]
+pub(crate) struct OpHeader(u32);
+
+impl OpHeader {
+    const fn new(opcode: OpCode) -> OpHeader {
+        OpHeader(opcode as u32)
+    }
+    const fn with_args(opcode: OpCode, args: u32) -> OpHeader {
+        OpHeader(opcode as u32 | args)
+    }
+}
+
+macro_rules! simple_op {
+    ($name:ident) => {
+        #[derive(Debug, Copy, Clone)]
+        pub(crate) struct $name(OpHeader);
+
+        impl $name {
+            pub(crate) const HEADER: $name = $name(OpHeader::new(OpCode::$name));
+        }
+    };
+}
+
+pub(crate) mod op {
+    use super::*;
+
+    simple_op!(StartVertex);
+    simple_op!(FinalizeVertex);
+    simple_op!(StartFragment);
+    simple_op!(FinalizeFragment);
+    simple_op!(StartCompute);
+    simple_op!(FinalizeCompute);
+
+    #[derive(Debug, Copy, Clone)]
+    pub(crate) struct RetireStamp(OpHeader);
+    impl RetireStamp {
+        pub(crate) const HEADER: RetireStamp =
+            RetireStamp(OpHeader::with_args(OpCode::RetireStamp, 0x40000000));
+    }
+
+    #[derive(Debug, Copy, Clone)]
+    pub(crate) struct WaitForIdle(OpHeader);
+    impl WaitForIdle {
+        pub(crate) const fn new(pipe: Pipe) -> WaitForIdle {
+            WaitForIdle(OpHeader::with_args(OpCode::WaitForIdle, (pipe as u32) << 8))
+        }
+    }
+
+    #[derive(Debug, Copy, Clone)]
+    pub(crate) struct Timestamp(OpHeader);
+    impl Timestamp {
+        #[allow(dead_code)]
+        pub(crate) const fn new(flag: bool) -> Timestamp {
+            Timestamp(OpHeader::with_args(OpCode::Timestamp, (flag as u32) << 31))
+        }
+    }
+}
+
+#[derive(Debug)]
+#[repr(C)]
+pub(crate) struct WaitForIdle {
+    pub(crate) header: op::WaitForIdle,
+}
+
+impl Operation for WaitForIdle {}
+
+#[derive(Debug)]
+#[repr(C)]
+pub(crate) struct RetireStamp {
+    pub(crate) header: op::RetireStamp,
+}
+
+impl Operation for RetireStamp {}
+
+#[versions(AGX)]
+#[derive(Debug)]
+#[repr(C)]
+pub(crate) struct Timestamp<'a> {
+    pub(crate) header: op::Timestamp,
+    pub(crate) cur_ts: GpuWeakPointer<U64>,
+    pub(crate) start_ts: GpuWeakPointer<Option<GpuPointer<'a, AtomicU64>>>,
+    pub(crate) update_ts: GpuWeakPointer<Option<GpuPointer<'a, AtomicU64>>>,
+    pub(crate) work_queue: GpuWeakPointer<workqueue::QueueInfo::ver>,
+    pub(crate) unk_24: U64,
+
+    #[ver(V >= V13_0B4)]
+    pub(crate) unk_ts: GpuWeakPointer<U64>,
+
+    pub(crate) uuid: u32,
+    pub(crate) unk_30_padding: u32,
+}
+
+#[versions(AGX)]
+impl<'a> Operation for Timestamp::ver<'a> {}
+
+#[versions(AGX)]
+#[derive(Debug)]
+#[repr(C)]
+pub(crate) struct StartVertex<'a> {
+    pub(crate) header: op::StartVertex,
+    pub(crate) tiling_params: GpuWeakPointer<vertex::raw::TilingParameters>,
+    pub(crate) job_params1: GpuWeakPointer<vertex::raw::JobParameters1::ver<'a>>,
+    pub(crate) buffer: GpuWeakPointer<buffer::Info::ver>,
+    pub(crate) scene: GpuWeakPointer<buffer::Scene::ver>,
+    pub(crate) stats: GpuWeakPointer<initdata::raw::GpuStatsVtx::ver>,
+    pub(crate) work_queue: GpuWeakPointer<workqueue::QueueInfo::ver>,
+    pub(crate) vm_slot: u32,
+    pub(crate) unk_38: u32,
+    pub(crate) event_generation: u32,
+    pub(crate) buffer_slot: u32,
+    pub(crate) unk_44: u32,
+    pub(crate) cmd_seq: U64,
+    pub(crate) unk_50: u32,
+    pub(crate) unk_pointer: GpuWeakPointer<u32>,
+    pub(crate) unk_job_buf: GpuWeakPointer<U64>,
+    pub(crate) unk_64: u32,
+    pub(crate) unk_68: u32,
+    pub(crate) uuid: u32,
+    pub(crate) unk_70: u32,
+    pub(crate) unk_74: Array<0x1d, U64>,
+    pub(crate) unk_15c: u32,
+    pub(crate) unk_160: U64,
+    pub(crate) unk_168: u32,
+    pub(crate) unk_16c: u32,
+    pub(crate) unk_170: U64,
+
+    #[ver(V >= V13_0B4)]
+    pub(crate) counter: U64,
+
+    #[ver(V >= V13_0B4)]
+    pub(crate) notifier_buf: GpuWeakPointer<Array<0x8, u8>>,
+
+    pub(crate) unk_178: u32,
+}
+
+#[versions(AGX)]
+impl<'a> Operation for StartVertex::ver<'a> {}
+
+#[versions(AGX)]
+#[derive(Debug)]
+#[repr(C)]
+pub(crate) struct FinalizeVertex {
+    pub(crate) header: op::FinalizeVertex,
+    pub(crate) scene: GpuWeakPointer<buffer::Scene::ver>,
+    pub(crate) buffer: GpuWeakPointer<buffer::Info::ver>,
+    pub(crate) stats: GpuWeakPointer<initdata::raw::GpuStatsVtx::ver>,
+    pub(crate) work_queue: GpuWeakPointer<workqueue::QueueInfo::ver>,
+    pub(crate) vm_slot: u32,
+    pub(crate) unk_28: u32,
+    pub(crate) unk_pointer: GpuWeakPointer<u32>,
+    pub(crate) unk_34: u32,
+    pub(crate) uuid: u32,
+    pub(crate) fw_stamp: GpuWeakPointer<FwStamp>,
+    pub(crate) stamp_value: EventValue,
+    pub(crate) unk_48: U64,
+    pub(crate) unk_50: u32,
+    pub(crate) unk_54: u32,
+    pub(crate) unk_58: U64,
+    pub(crate) unk_60: u32,
+    pub(crate) unk_64: u32,
+    pub(crate) unk_68: u32,
+
+    #[ver(G >= G14 && V < V13_0B4)]
+    pub(crate) unk_68_g14: U64,
+
+    pub(crate) restart_branch_offset: i32,
+    pub(crate) unk_70: u32,
+
+    #[ver(V >= V13_0B4)]
+    pub(crate) unk_74: Array<0x10, u8>,
+}
+
+#[versions(AGX)]
+impl Operation for FinalizeVertex::ver {}
+
+#[versions(AGX)]
+#[derive(Debug)]
+#[repr(C)]
+pub(crate) struct StartFragment<'a> {
+    pub(crate) header: op::StartFragment,
+    pub(crate) job_params2: GpuWeakPointer<fragment::raw::JobParameters2>,
+    pub(crate) job_params1: GpuWeakPointer<fragment::raw::JobParameters1::ver<'a>>,
+    pub(crate) scene: GpuPointer<'a, buffer::Scene::ver>,
+    pub(crate) stats: GpuWeakPointer<initdata::raw::GpuStatsFrag::ver>,
+    pub(crate) busy_flag: GpuWeakPointer<u32>,
+    pub(crate) tvb_overflow_count: GpuWeakPointer<u32>,
+    pub(crate) unk_pointer: GpuWeakPointer<u32>,
+    pub(crate) work_queue: GpuWeakPointer<workqueue::QueueInfo::ver>,
+    pub(crate) work_item: GpuWeakPointer<fragment::RunFragment::ver>,
+    pub(crate) vm_slot: u32,
+    pub(crate) unk_50: u32,
+    pub(crate) event_generation: u32,
+    pub(crate) buffer_slot: u32,
+    pub(crate) unk_5c: u32,
+    pub(crate) cmd_seq: U64,
+    pub(crate) unk_68: u32,
+    pub(crate) unk_758_flag: GpuWeakPointer<u32>,
+    pub(crate) unk_job_buf: GpuWeakPointer<U64>,
+    pub(crate) unk_7c: u32,
+    pub(crate) unk_80: u32,
+    pub(crate) unk_84: u32,
+    pub(crate) uuid: u32,
+    pub(crate) attachments: Attachments,
+    pub(crate) unk_190: u32,
+
+    #[ver(V >= V13_0B4)]
+    pub(crate) counter: U64,
+
+    #[ver(V >= V13_0B4)]
+    pub(crate) notifier_buf: GpuWeakPointer<Array<0x8, u8>>,
+}
+
+#[versions(AGX)]
+impl<'a> Operation for StartFragment::ver<'a> {}
+
+#[versions(AGX)]
+#[derive(Debug)]
+#[repr(C)]
+pub(crate) struct FinalizeFragment {
+    pub(crate) header: op::FinalizeFragment,
+    pub(crate) uuid: u32,
+    pub(crate) unk_8: u32,
+    pub(crate) fw_stamp: GpuWeakPointer<FwStamp>,
+    pub(crate) stamp_value: EventValue,
+    pub(crate) unk_18: u32,
+    pub(crate) scene: GpuWeakPointer<buffer::Scene::ver>,
+    pub(crate) buffer: GpuWeakPointer<buffer::Info::ver>,
+    pub(crate) unk_2c: U64,
+    pub(crate) stats: GpuWeakPointer<initdata::raw::GpuStatsFrag::ver>,
+    pub(crate) unk_pointer: GpuWeakPointer<u32>,
+    pub(crate) busy_flag: GpuWeakPointer<u32>,
+    pub(crate) work_queue: GpuWeakPointer<workqueue::QueueInfo::ver>,
+    pub(crate) work_item: GpuWeakPointer<fragment::RunFragment::ver>,
+    pub(crate) vm_slot: u32,
+    pub(crate) unk_60: u32,
+    pub(crate) unk_758_flag: GpuWeakPointer<u32>,
+    pub(crate) unk_6c: U64,
+    pub(crate) unk_74: U64,
+    pub(crate) unk_7c: U64,
+    pub(crate) unk_84: U64,
+    pub(crate) unk_8c: U64,
+
+    #[ver(G == G14 && V < V13_0B4)]
+    pub(crate) unk_8c_g14: U64,
+
+    pub(crate) restart_branch_offset: i32,
+    pub(crate) unk_98: u32,
+
+    #[ver(V >= V13_0B4)]
+    pub(crate) unk_9c: Array<0x10, u8>,
+}
+
+#[versions(AGX)]
+impl Operation for FinalizeFragment::ver {}
+
+#[versions(AGX)]
+#[derive(Debug)]
+#[repr(C)]
+pub(crate) struct StartCompute<'a> {
+    pub(crate) header: op::StartCompute,
+    pub(crate) unk_pointer: GpuWeakPointer<Array<0x54, u8>>,
+    pub(crate) job_params1: GpuWeakPointer<compute::raw::JobParameters1<'a>>,
+    pub(crate) stats: GpuWeakPointer<initdata::GpuStatsComp>,
+    pub(crate) work_queue: GpuWeakPointer<workqueue::QueueInfo::ver>,
+    pub(crate) vm_slot: u32,
+    pub(crate) unk_28: u32,
+    pub(crate) event_generation: u32,
+    pub(crate) cmd_seq: U64,
+    pub(crate) unk_38: u32,
+    pub(crate) job_params2: GpuWeakPointer<compute::raw::JobParameters2::ver<'a>>,
+    pub(crate) unk_44: u32,
+    pub(crate) uuid: u32,
+    pub(crate) attachments: Attachments,
+    pub(crate) padding: u32,
+
+    #[ver(V >= V13_0B4)]
+    pub(crate) unk_flag: GpuWeakPointer<U32>,
+
+    #[ver(V >= V13_0B4)]
+    pub(crate) counter: U64,
+
+    #[ver(V >= V13_0B4)]
+    pub(crate) notifier_buf: GpuWeakPointer<Array<0x8, u8>>,
+}
+
+#[versions(AGX)]
+impl<'a> Operation for StartCompute::ver<'a> {}
+
+#[versions(AGX)]
+#[derive(Debug)]
+#[repr(C)]
+pub(crate) struct FinalizeCompute<'a> {
+    pub(crate) header: op::FinalizeCompute,
+    pub(crate) stats: GpuWeakPointer<initdata::GpuStatsComp>,
+    pub(crate) work_queue: GpuWeakPointer<workqueue::QueueInfo::ver>,
+    pub(crate) vm_slot: u32,
+    #[ver(V < V13_0B4)]
+    pub(crate) unk_18: u32,
+    pub(crate) job_params2: GpuWeakPointer<compute::raw::JobParameters2::ver<'a>>,
+    pub(crate) unk_24: u32,
+    pub(crate) uuid: u32,
+    pub(crate) fw_stamp: GpuWeakPointer<FwStamp>,
+    pub(crate) stamp_value: EventValue,
+    pub(crate) unk_38: u32,
+    pub(crate) unk_3c: u32,
+    pub(crate) unk_40: u32,
+    pub(crate) unk_44: u32,
+    pub(crate) unk_48: u32,
+    pub(crate) unk_4c: u32,
+    pub(crate) unk_50: u32,
+    pub(crate) unk_54: u32,
+    pub(crate) unk_58: u32,
+
+    #[ver(G == G14 && V < V13_0B4)]
+    pub(crate) unk_5c_g14: U64,
+
+    pub(crate) restart_branch_offset: i32,
+    pub(crate) unk_60: u32,
+
+    #[ver(V >= V13_0B4)]
+    pub(crate) unk_64: Array<0xd, u8>,
+
+    #[ver(V >= V13_0B4)]
+    pub(crate) unk_flag: GpuWeakPointer<U32>,
+
+    #[ver(V >= V13_0B4)]
+    pub(crate) unk_79: Array<0x7, u8>,
+}
+
+#[versions(AGX)]
+impl<'a> Operation for FinalizeCompute::ver<'a> {}
diff --git a/drivers/gpu/drm/asahi/fw/mod.rs b/drivers/gpu/drm/asahi/fw/mod.rs
new file mode 100644
index 000000000000..a5649aa20d3a
--- /dev/null
+++ b/drivers/gpu/drm/asahi/fw/mod.rs
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+
+//! Firmware structures for Apple AGX GPUs
+
+pub(crate) mod buffer;
+pub(crate) mod channels;
+pub(crate) mod compute;
+pub(crate) mod event;
+pub(crate) mod fragment;
+pub(crate) mod initdata;
+pub(crate) mod job;
+pub(crate) mod microseq;
+pub(crate) mod types;
+pub(crate) mod vertex;
+pub(crate) mod workqueue;
diff --git a/drivers/gpu/drm/asahi/fw/types.rs b/drivers/gpu/drm/asahi/fw/types.rs
new file mode 100644
index 000000000000..c1a07be1e047
--- /dev/null
+++ b/drivers/gpu/drm/asahi/fw/types.rs
@@ -0,0 +1,233 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+
+//! Common types for firmware structure definitions
+
+use crate::{alloc, object};
+use core::fmt;
+use core::ops::{Deref, DerefMut, Index, IndexMut};
+
+pub(crate) use crate::event::EventValue;
+pub(crate) use crate::object::{GpuPointer, GpuStruct, GpuWeakPointer};
+pub(crate) use crate::{f32, float::F32};
+
+pub(crate) use ::alloc::boxed::Box;
+pub(crate) use core::fmt::Debug;
+pub(crate) use core::marker::PhantomData;
+pub(crate) use core::sync::atomic::{AtomicI32, AtomicU32, AtomicU64};
+pub(crate) use kernel::macros::versions;
+
+// Make the trait visible
+pub(crate) use crate::alloc::Allocator as _Allocator;
+
+/// General allocator type used for the driver
+pub(crate) type Allocator = alloc::DefaultAllocator;
+
+/// General GpuObject type used for the driver
+pub(crate) type GpuObject<T> =
+    object::GpuObject<T, alloc::GenericAlloc<T, alloc::DefaultAllocation>>;
+
+/// General GpuArray type used for the driver
+pub(crate) type GpuArray<T> = object::GpuArray<T, alloc::GenericAlloc<T, alloc::DefaultAllocation>>;
+
+/// General GpuOnlyArray type used for the driver
+pub(crate) type GpuOnlyArray<T> =
+    object::GpuOnlyArray<T, alloc::GenericAlloc<T, alloc::DefaultAllocation>>;
+
+/// A stamp slot that is shared between firmware and the driver.
+#[derive(Debug, Default)]
+#[repr(transparent)]
+pub(crate) struct Stamp(pub(crate) AtomicU32);
+
+/// A stamp slot that is for private firmware use.
+///
+/// This is a separate type to guard against pointer type confusion.
+#[derive(Debug, Default)]
+#[repr(transparent)]
+pub(crate) struct FwStamp(pub(crate) AtomicU32);
+
+/// An unaligned u64 type.
+///
+/// This is useful to avoid having to pack firmware structures entirely, since that is incompatible
+/// with `#[derive(Debug)]` and atomics.
+#[derive(Copy, Clone, Default)]
+#[repr(C, packed(1))]
+pub(crate) struct U64(pub(crate) u64);
+
+unsafe impl Zeroed for U64 {}
+
+impl fmt::Debug for U64 {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let v = self.0;
+        f.write_fmt(format_args!("{:#x}", v))
+    }
+}
+
+/// An unaligned u32 type.
+///
+/// This is useful to avoid having to pack firmware structures entirely, since that is incompatible
+/// with `#[derive(Debug)]` and atomics.
+#[derive(Copy, Clone, Default)]
+#[repr(C, packed(1))]
+pub(crate) struct U32(pub(crate) u32);
+
+unsafe impl Zeroed for U32 {}
+
+impl fmt::Debug for U32 {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let v = self.0;
+        f.write_fmt(format_args!("{:#x}", v))
+    }
+}
+
+unsafe impl Zeroed for u8 {}
+unsafe impl Zeroed for u16 {}
+unsafe impl Zeroed for u32 {}
+unsafe impl Zeroed for u64 {}
+unsafe impl Zeroed for i8 {}
+unsafe impl Zeroed for i16 {}
+unsafe impl Zeroed for i32 {}
+unsafe impl Zeroed for i64 {}
+
+/// Create a dummy `Debug` implementation, for when we need it but it's too painful to write by
+/// hand or not very useful.
+#[macro_export]
+macro_rules! no_debug {
+    ($type:ty) => {
+        impl Debug for $type {
+            fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+                write!(f, "...")
+            }
+        }
+    };
+}
+
+/// Types which can be safely initialized with an all-zero bit pattern.
+///
+/// See: https://github.com/rust-lang/rfcs/issues/2626
+///
+/// # Safety
+///
+/// This trait must only be implemented if a type only contains primitive types which can be
+/// zero-initialized, FFI structs intended to be zero-initialized, or other types which impl Zeroed.
+pub(crate) unsafe trait Zeroed: Default {
+    fn zeroed() -> Self {
+        // SAFETY: The user is responsible for ensuring this is safe.
+        unsafe { core::mem::zeroed() }
+    }
+}
+
+/// Implement Zeroed for a given type (and Default along with it).
+///
+/// # Safety
+///
+/// This macro must only be used if a type only contains primitive types which can be
+/// zero-initialized, FFI structs intended to be zero-initialized, or other types which impl Zeroed.
+#[macro_export]
+macro_rules! default_zeroed {
+    (<$($lt:lifetime),*>, $type:ty) => {
+        impl<$($lt),*> Default for $type {
+            fn default() -> $type {
+                Zeroed::zeroed()
+            }
+        }
+        // SAFETY: The user is responsible for ensuring this is safe.
+        unsafe impl<$($lt),*> Zeroed for $type {}
+    };
+    ($type:ty) => {
+        impl Default for $type {
+            fn default() -> $type {
+                Zeroed::zeroed()
+            }
+        }
+        // SAFETY: The user is responsible for ensuring this is safe.
+        unsafe impl Zeroed for $type {}
+    };
+}
+
+/// A convenience type for a number of padding bytes. Hidden from Debug formatting.
+#[derive(Copy, Clone)]
+#[repr(C, packed)]
+pub(crate) struct Pad<const N: usize>([u8; N]);
+
+/// SAFETY: Primitive type, safe to zero-init.
+unsafe impl<const N: usize> Zeroed for Pad<N> {}
+
+impl<const N: usize> Default for Pad<N> {
+    fn default() -> Self {
+        Zeroed::zeroed()
+    }
+}
+
+impl<const N: usize> fmt::Debug for Pad<N> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.write_fmt(format_args!("<pad>"))
+    }
+}
+
+/// A convenience type for a fixed-sized array with Default/Zeroed impls.
+#[derive(Copy, Clone)]
+#[repr(C)]
+pub(crate) struct Array<const N: usize, T>([T; N]);
+
+impl<const N: usize, T> Array<N, T> {
+    pub(crate) fn new(data: [T; N]) -> Self {
+        Self(data)
+    }
+}
+
+// SAFETY: Arrays of Zeroed values can be safely Zeroed.
+unsafe impl<const N: usize, T: Zeroed> Zeroed for Array<N, T> {}
+
+impl<const N: usize, T: Zeroed> Default for Array<N, T> {
+    fn default() -> Self {
+        Zeroed::zeroed()
+    }
+}
+
+impl<const N: usize, T> Index<usize> for Array<N, T> {
+    type Output = T;
+
+    fn index(&self, index: usize) -> &Self::Output {
+        &self.0[index]
+    }
+}
+
+impl<const N: usize, T> IndexMut<usize> for Array<N, T> {
+    fn index_mut(&mut self, index: usize) -> &mut Self::Output {
+        &mut self.0[index]
+    }
+}
+
+impl<const N: usize, T> Deref for Array<N, T> {
+    type Target = [T; N];
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+impl<const N: usize, T> DerefMut for Array<N, T> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.0
+    }
+}
+
+impl<const N: usize, T: Sized + fmt::Debug> fmt::Debug for Array<N, T> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        self.0.fmt(f)
+    }
+}
+
+/// Convenience macro to define an identically-named trivial GpuStruct with no inner fields for a
+/// given raw type name.
+#[macro_export]
+macro_rules! trivial_gpustruct {
+    ($type:ident) => {
+        #[derive(Debug, Default)]
+        pub(crate) struct $type {}
+
+        impl GpuStruct for $type {
+            type Raw<'a> = raw::$type;
+        }
+    };
+}
diff --git a/drivers/gpu/drm/asahi/fw/vertex.rs b/drivers/gpu/drm/asahi/fw/vertex.rs
new file mode 100644
index 000000000000..959a0913e693
--- /dev/null
+++ b/drivers/gpu/drm/asahi/fw/vertex.rs
@@ -0,0 +1,177 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+
+//! GPU vertex job firmware structures
+
+use super::types::*;
+use super::{event, job, workqueue};
+use crate::{buffer, fw, microseq, mmu};
+use kernel::sync::Arc;
+
+pub(crate) mod raw {
+    use super::*;
+
+    #[derive(Debug, Default, Copy, Clone)]
+    #[repr(C)]
+    pub(crate) struct TilingParameters {
+        pub(crate) rgn_size: u32,
+        pub(crate) unk_4: u32,
+        pub(crate) ppp_ctrl: u32,
+        pub(crate) x_max: u16,
+        pub(crate) y_max: u16,
+        pub(crate) te_screen: u32,
+        pub(crate) te_mtile1: u32,
+        pub(crate) te_mtile2: u32,
+        pub(crate) tiles_per_mtile: u32,
+        pub(crate) tpc_stride: u32,
+        pub(crate) unk_24: u32,
+        pub(crate) unk_28: u32,
+    }
+
+    #[versions(AGX)]
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct JobParameters1<'a> {
+        pub(crate) unk_0: U64,
+        pub(crate) unk_8: F32,
+        pub(crate) unk_c: F32,
+        pub(crate) tvb_tilemap: GpuPointer<'a, &'a [u8]>,
+        #[ver(G < G14)]
+        pub(crate) tvb_cluster_tilemaps: Option<GpuPointer<'a, &'a [u8]>>,
+        pub(crate) tpc: GpuPointer<'a, &'a [u8]>,
+        pub(crate) tvb_heapmeta: GpuPointer<'a, &'a [u8]>,
+        pub(crate) iogpu_unk_54: u32,
+        pub(crate) iogpu_unk_55: u32,
+        pub(crate) iogpu_unk_56: U64,
+        #[ver(G < G14)]
+        pub(crate) tvb_cluster_meta1: Option<GpuPointer<'a, &'a [u8]>>,
+        pub(crate) utile_config: u32,
+        pub(crate) unk_4c: u32,
+        pub(crate) ppp_multisamplectl: U64,
+        pub(crate) tvb_heapmeta_2: GpuPointer<'a, &'a [u8]>,
+        #[ver(G < G14)]
+        pub(crate) unk_60: U64,
+        #[ver(G < G14)]
+        pub(crate) core_mask: Array<2, u32>,
+        pub(crate) preempt_buf1: GpuPointer<'a, &'a [u8]>,
+        pub(crate) preempt_buf2: GpuPointer<'a, &'a [u8]>,
+        pub(crate) unk_80: U64,
+        pub(crate) preempt_buf3: GpuPointer<'a, &'a [u8]>,
+        pub(crate) encoder_addr: U64,
+        #[ver(G < G14)]
+        pub(crate) tvb_cluster_meta2: Option<GpuPointer<'a, &'a [u8]>>,
+        #[ver(G < G14)]
+        pub(crate) tvb_cluster_meta3: Option<GpuPointer<'a, &'a [u8]>>,
+        #[ver(G < G14)]
+        pub(crate) tiling_control: u32,
+        #[ver(G < G14)]
+        pub(crate) unk_ac: u32,
+        pub(crate) unk_b0: Array<6, U64>,
+        pub(crate) pipeline_base: U64,
+        #[ver(G < G14)]
+        pub(crate) tvb_cluster_meta4: Option<GpuPointer<'a, &'a [u8]>>,
+        #[ver(G < G14)]
+        pub(crate) unk_f0: U64,
+        pub(crate) unk_f8: U64,
+        pub(crate) unk_100: Array<3, U64>,
+        pub(crate) unk_118: u32,
+        #[ver(G >= G14)]
+        pub(crate) __pad: Pad<{ 8 * 9 }>,
+    }
+
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct JobParameters2<'a> {
+        pub(crate) unk_480: Array<4, u32>,
+        pub(crate) unk_498: U64,
+        pub(crate) unk_4a0: u32,
+        pub(crate) preempt_buf1: GpuPointer<'a, &'a [u8]>,
+        pub(crate) unk_4ac: u32,
+        pub(crate) unk_4b0: U64,
+        pub(crate) unk_4b8: u32,
+        pub(crate) unk_4bc: U64,
+        pub(crate) unk_4c4_padding: Array<0x48, u8>,
+        pub(crate) unk_50c: u32,
+        pub(crate) unk_510: U64,
+        pub(crate) unk_518: U64,
+        pub(crate) unk_520: U64,
+    }
+
+    #[versions(AGX)]
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct RunVertex<'a> {
+        pub(crate) tag: workqueue::CommandType,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) counter: U64,
+
+        pub(crate) vm_slot: u32,
+        pub(crate) unk_8: u32,
+        pub(crate) notifier: GpuPointer<'a, event::Notifier::ver>,
+        pub(crate) buffer_slot: u32,
+        pub(crate) unk_1c: u32,
+        pub(crate) buffer: GpuPointer<'a, fw::buffer::Info::ver>,
+        pub(crate) scene: GpuPointer<'a, fw::buffer::Scene::ver>,
+        pub(crate) unk_buffer_buf: GpuWeakPointer<[u8]>,
+        pub(crate) unk_34: u32,
+        pub(crate) job_params1: JobParameters1::ver<'a>,
+        pub(crate) unk_154: Array<0x268, u8>,
+        pub(crate) tiling_params: TilingParameters,
+        pub(crate) unk_3e8: Array<0x74, u8>,
+        pub(crate) tpc: GpuPointer<'a, &'a [u8]>,
+        pub(crate) tpc_size: U64,
+        pub(crate) microsequence: GpuPointer<'a, &'a [u8]>,
+        pub(crate) microsequence_size: u32,
+        pub(crate) fragment_stamp_slot: u32,
+        pub(crate) fragment_stamp_value: EventValue,
+        pub(crate) unk_pointee: u32,
+        pub(crate) unk_pad: u32,
+        pub(crate) job_params2: JobParameters2<'a>,
+        pub(crate) encoder_params: job::raw::EncoderParams<'a>,
+        pub(crate) unk_55c: u32,
+        pub(crate) unk_560: u32,
+        pub(crate) memoryless_rts_used: u32,
+        pub(crate) unk_568: u32,
+        pub(crate) unk_56c: u32,
+        pub(crate) meta: job::raw::JobMeta,
+        pub(crate) unk_after_meta: u32,
+        pub(crate) unk_buf_0: U64,
+        pub(crate) unk_buf_8: U64,
+        pub(crate) unk_buf_10: U64,
+        pub(crate) cur_ts: U64,
+        pub(crate) start_ts: Option<GpuPointer<'a, AtomicU64>>,
+        pub(crate) end_ts: Option<GpuPointer<'a, AtomicU64>>,
+        pub(crate) unk_5c4: u32,
+        pub(crate) unk_5c8: u32,
+        pub(crate) unk_5cc: u32,
+        pub(crate) unk_5d0: u32,
+        pub(crate) client_sequence: u8,
+        pub(crate) pad_5d5: Array<3, u8>,
+        pub(crate) unk_5d8: u32,
+        pub(crate) unk_5dc: u8,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_ts: U64,
+
+        #[ver(V >= V13_0B4)]
+        pub(crate) unk_5dd_8: Array<0x1b, u8>,
+    }
+}
+
+#[versions(AGX)]
+#[derive(Debug)]
+pub(crate) struct RunVertex {
+    pub(crate) notifier: Arc<GpuObject<event::Notifier::ver>>,
+    pub(crate) scene: Arc<buffer::Scene::ver>,
+    pub(crate) micro_seq: microseq::MicroSequence,
+    pub(crate) vm_bind: mmu::VmBind,
+    pub(crate) timestamps: Arc<GpuObject<job::RenderTimestamps>>,
+}
+
+#[versions(AGX)]
+impl GpuStruct for RunVertex::ver {
+    type Raw<'a> = raw::RunVertex::ver<'a>;
+}
+
+#[versions(AGX)]
+impl workqueue::Command for RunVertex::ver {}
diff --git a/drivers/gpu/drm/asahi/fw/workqueue.rs b/drivers/gpu/drm/asahi/fw/workqueue.rs
new file mode 100644
index 000000000000..e81025b6c014
--- /dev/null
+++ b/drivers/gpu/drm/asahi/fw/workqueue.rs
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+
+//! GPU work queue firmware structes
+
+use super::event;
+use super::types::*;
+use crate::event::EventValue;
+use crate::{default_zeroed, trivial_gpustruct};
+use kernel::sync::Arc;
+
+#[derive(Debug)]
+#[repr(u32)]
+pub(crate) enum CommandType {
+    RunVertex = 0,
+    RunFragment = 1,
+    #[allow(dead_code)]
+    RunBlitter = 2,
+    RunCompute = 3,
+    Barrier = 4,
+    InitBuffer = 6,
+}
+
+pub(crate) trait Command: GpuStruct + Send + Sync {}
+
+pub(crate) mod raw {
+    use super::*;
+
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct Barrier {
+        pub(crate) tag: CommandType,
+        pub(crate) wait_stamp: GpuWeakPointer<FwStamp>,
+        pub(crate) wait_value: EventValue,
+        pub(crate) wait_slot: u32,
+        pub(crate) stamp_self: EventValue,
+        pub(crate) uuid: u32,
+        pub(crate) unk: u32,
+    }
+
+    #[derive(Debug, Clone, Copy)]
+    #[repr(C)]
+    pub(crate) struct GpuContextData {
+        pub(crate) unk_0: u8,
+        pub(crate) unk_1: u8,
+        unk_2: Array<0x2, u8>,
+        pub(crate) unk_4: u8,
+        pub(crate) unk_5: u8,
+        unk_6: Array<0x18, u8>,
+        pub(crate) unk_1e: u8,
+        pub(crate) unk_1f: u8,
+        unk_20: Array<0x3, u8>,
+        pub(crate) unk_23: u8,
+        unk_24: Array<0x1c, u8>,
+    }
+
+    impl Default for GpuContextData {
+        fn default() -> Self {
+            Self {
+                unk_0: 0xff,
+                unk_1: 0xff,
+                unk_2: Default::default(),
+                unk_4: 0,
+                unk_5: 1,
+                unk_6: Default::default(),
+                unk_1e: 0xff,
+                unk_1f: 0,
+                unk_20: Default::default(),
+                unk_23: 2,
+                unk_24: Default::default(),
+            }
+        }
+    }
+
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct RingState {
+        pub(crate) gpu_doneptr: AtomicU32,
+        __pad0: Pad<0xc>,
+        pub(crate) unk_10: AtomicU32,
+        __pad1: Pad<0xc>,
+        pub(crate) unk_20: AtomicU32,
+        __pad2: Pad<0xc>,
+        pub(crate) gpu_rptr: AtomicU32,
+        __pad3: Pad<0xc>,
+        pub(crate) cpu_wptr: AtomicU32,
+        __pad4: Pad<0xc>,
+        pub(crate) rb_size: u32,
+        __pad5: Pad<0xc>,
+        // This isn't part of the structure, but it's here as a
+        // debugging hack so we can inspect what ring position
+        // the driver considered complete and freeable.
+        pub(crate) cpu_freeptr: AtomicU32,
+        __pad6: Pad<0xc>,
+    }
+    default_zeroed!(RingState);
+
+    #[derive(Debug, Clone, Copy)]
+    #[repr(C)]
+    pub(crate) struct Priority(u32, u32, U64, u32, u32, u32);
+
+    pub(crate) const PRIORITY: [Priority; 4] = [
+        Priority(0, 0, U64(0xffff_ffff_ffff_0000), 1, 0, 1),
+        Priority(1, 1, U64(0xffff_ffff_0000_0000), 0, 0, 0),
+        Priority(2, 2, U64(0xffff_0000_0000_0000), 0, 0, 2),
+        Priority(3, 3, U64(0x0000_0000_0000_0000), 0, 0, 3),
+    ];
+
+    impl Default for Priority {
+        fn default() -> Priority {
+            PRIORITY[2]
+        }
+    }
+
+    #[versions(AGX)]
+    #[derive(Debug)]
+    #[repr(C)]
+    pub(crate) struct QueueInfo<'a> {
+        pub(crate) state: GpuPointer<'a, super::RingState>,
+        pub(crate) ring: GpuPointer<'a, &'a [u64]>,
+        pub(crate) notifier_list: GpuPointer<'a, event::NotifierList>,
+        pub(crate) gpu_buf: GpuPointer<'a, &'a [u8]>,
+        pub(crate) gpu_rptr1: AtomicU32,
+        pub(crate) gpu_rptr2: AtomicU32,
+        pub(crate) gpu_rptr3: AtomicU32,
+        pub(crate) event_id: AtomicI32,
+        pub(crate) priority: Priority,
+        pub(crate) unk_4c: i32,
+        pub(crate) uuid: u32,
+        pub(crate) unk_54: i32,
+        pub(crate) unk_58: U64,
+        pub(crate) busy: AtomicU32,
+        pub(crate) __pad: Pad<0x20>,
+        pub(crate) unk_84_state: AtomicU32,
+        pub(crate) unk_88: u32,
+        pub(crate) unk_8c: u32,
+        pub(crate) unk_90: u32,
+        pub(crate) unk_94: u32,
+        pub(crate) pending: AtomicU32,
+        pub(crate) unk_9c: u32,
+        #[ver(V >= V13_2)]
+        pub(crate) unk_a0_0: u32,
+        pub(crate) gpu_context: GpuPointer<'a, super::GpuContextData>,
+        pub(crate) unk_a8: U64,
+        #[ver(V >= V13_2)]
+        pub(crate) unk_b0: u32,
+    }
+}
+
+trivial_gpustruct!(Barrier);
+trivial_gpustruct!(GpuContextData);
+trivial_gpustruct!(RingState);
+
+impl Command for Barrier {}
+
+#[versions(AGX)]
+#[derive(Debug)]
+pub(crate) struct QueueInfo {
+    pub(crate) state: GpuObject<RingState>,
+    pub(crate) ring: GpuArray<u64>,
+    pub(crate) gpu_buf: GpuArray<u8>,
+    pub(crate) notifier_list: Arc<GpuObject<event::NotifierList>>,
+    pub(crate) gpu_context: Arc<crate::workqueue::GpuContext>,
+}
+
+#[versions(AGX)]
+impl GpuStruct for QueueInfo::ver {
+    type Raw<'a> = raw::QueueInfo::ver<'a>;
+}
diff --git a/drivers/gpu/drm/asahi/gem.rs b/drivers/gpu/drm/asahi/gem.rs
new file mode 100644
index 000000000000..b334bebb0e8e
--- /dev/null
+++ b/drivers/gpu/drm/asahi/gem.rs
@@ -0,0 +1,301 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+
+//! Asahi driver GEM object implementation
+//!
+//! Basic wrappers and adaptations between generic GEM shmem objects and this driver's
+//! view of what a GPU buffer object is. It is in charge of keeping track of all mappings for
+//! each GEM object so we can remove them when a client (File) or a Vm are destroyed, as well as
+//! implementing RTKit buffers on top of GEM objects for firmware use.
+
+use kernel::{
+    bindings,
+    drm::{gem, gem::shmem},
+    error::Result,
+    prelude::*,
+    soc::apple::rtkit,
+    sync::smutex::Mutex,
+};
+
+use kernel::drm::gem::BaseObject;
+
+use core::sync::atomic::{AtomicU64, Ordering};
+
+use crate::debug::*;
+use crate::driver::AsahiDevice;
+use crate::file::DrmFile;
+
+const DEBUG_CLASS: DebugFlags = DebugFlags::Gem;
+
+/// Represents the inner data of a GEM object for this driver.
+pub(crate) struct DriverObject {
+    /// Whether this is a kernel-created object.
+    kernel: bool,
+    /// Object creation flags.
+    flags: u32,
+    /// VM ID for VM-private objects.
+    vm_id: Option<u64>,
+    /// Locked list of mapping tuples: (file_id, vm_id, mapping)
+    mappings: Mutex<Vec<(u64, u64, crate::mmu::Mapping)>>,
+    /// ID for debug
+    id: u64,
+}
+
+/// Type alias for the shmem GEM object type for this driver.
+pub(crate) type Object = shmem::Object<DriverObject>;
+
+/// Type alias for the SGTable type for this driver.
+pub(crate) type SGTable = shmem::SGTable<DriverObject>;
+
+/// A shared reference to a GEM object for this driver.
+pub(crate) struct ObjectRef {
+    /// The underlying GEM object reference
+    pub(crate) gem: gem::ObjectRef<shmem::Object<DriverObject>>,
+    /// The kernel-side VMap of this object, if needed
+    vmap: Option<shmem::VMap<DriverObject>>,
+}
+
+static GEM_ID: AtomicU64 = AtomicU64::new(0);
+
+impl DriverObject {
+    /// Drop all object mappings for a given file ID.
+    ///
+    /// Used on file close.
+    fn drop_file_mappings(&self, file_id: u64) {
+        let mut mappings = self.mappings.lock();
+        for (index, (mapped_fid, _mapped_vmid, _mapping)) in mappings.iter().enumerate() {
+            if *mapped_fid == file_id {
+                mappings.swap_remove(index);
+                return;
+            }
+        }
+    }
+
+    /// Drop all object mappings for a given VM ID.
+    ///
+    /// Used on VM destroy.
+    fn drop_vm_mappings(&self, vm_id: u64) {
+        let mut mappings = self.mappings.lock();
+        for (index, (_mapped_fid, mapped_vmid, _mapping)) in mappings.iter().enumerate() {
+            if *mapped_vmid == vm_id {
+                mappings.swap_remove(index);
+                return;
+            }
+        }
+    }
+}
+
+impl ObjectRef {
+    /// Create a new wrapper for a raw GEM object reference.
+    pub(crate) fn new(gem: gem::ObjectRef<shmem::Object<DriverObject>>) -> ObjectRef {
+        ObjectRef { gem, vmap: None }
+    }
+
+    /// Return the `VMap` for this object, creating it if necessary.
+    pub(crate) fn vmap(&mut self) -> Result<&mut shmem::VMap<DriverObject>> {
+        if self.vmap.is_none() {
+            self.vmap = Some(self.gem.vmap()?);
+        }
+        Ok(self.vmap.as_mut().unwrap())
+    }
+
+    /// Return the IOVA of this object at which it is mapped in a given `Vm` identified by its ID,
+    /// if it is mapped in that `Vm`.
+    pub(crate) fn iova(&self, vm_id: u64) -> Option<usize> {
+        let mappings = self.gem.mappings.lock();
+        for (_mapped_fid, mapped_vmid, mapping) in mappings.iter() {
+            if *mapped_vmid == vm_id {
+                return Some(mapping.iova());
+            }
+        }
+
+        None
+    }
+
+    /// Returns the size of an object in bytes
+    pub(crate) fn size(&self) -> usize {
+        self.gem.size()
+    }
+
+    /// Maps an object into a given `Vm` at any free address.
+    ///
+    /// Returns Err(EBUSY) if there is already a mapping.
+    pub(crate) fn map_into(&mut self, vm: &crate::mmu::Vm) -> Result<usize> {
+        let vm_id = vm.id();
+
+        if self.gem.vm_id.is_some() && self.gem.vm_id != Some(vm_id) {
+            return Err(EINVAL);
+        }
+
+        let mut mappings = self.gem.mappings.lock();
+        for (_mapped_fid, mapped_vmid, _mapping) in mappings.iter() {
+            if *mapped_vmid == vm_id {
+                return Err(EBUSY);
+            }
+        }
+
+        let sgt = self.gem.sg_table()?;
+        let new_mapping = vm.map(self.gem.size(), sgt)?;
+
+        let iova = new_mapping.iova();
+        mappings.try_push((vm.file_id(), vm_id, new_mapping))?;
+        Ok(iova)
+    }
+
+    /// Maps an object into a given `Vm` at any free address within a given range.
+    ///
+    /// Returns Err(EBUSY) if there is already a mapping.
+    pub(crate) fn map_into_range(
+        &mut self,
+        vm: &crate::mmu::Vm,
+        start: u64,
+        end: u64,
+        alignment: u64,
+        prot: u32,
+        guard: bool,
+    ) -> Result<usize> {
+        let vm_id = vm.id();
+
+        if self.gem.vm_id.is_some() && self.gem.vm_id != Some(vm_id) {
+            return Err(EINVAL);
+        }
+
+        let mut mappings = self.gem.mappings.lock();
+        for (_mapped_fid, mapped_vmid, _mapping) in mappings.iter() {
+            if *mapped_vmid == vm_id {
+                return Err(EBUSY);
+            }
+        }
+
+        let sgt = self.gem.sg_table()?;
+        let new_mapping =
+            vm.map_in_range(self.gem.size(), sgt, alignment, start, end, prot, guard)?;
+
+        let iova = new_mapping.iova();
+        mappings.try_push((vm.file_id(), vm_id, new_mapping))?;
+        Ok(iova)
+    }
+
+    /// Maps an object into a given `Vm` at a specific address.
+    ///
+    /// Returns Err(EBUSY) if there is already a mapping.
+    /// Returns Err(ENOSPC) if the requested address is already busy.
+    pub(crate) fn map_at(
+        &mut self,
+        vm: &crate::mmu::Vm,
+        addr: u64,
+        prot: u32,
+        guard: bool,
+    ) -> Result {
+        let vm_id = vm.id();
+
+        if self.gem.vm_id.is_some() && self.gem.vm_id != Some(vm_id) {
+            return Err(EINVAL);
+        }
+
+        let mut mappings = self.gem.mappings.lock();
+        for (_mapped_fid, mapped_vmid, _mapping) in mappings.iter() {
+            if *mapped_vmid == vm_id {
+                return Err(EBUSY);
+            }
+        }
+
+        let sgt = self.gem.sg_table()?;
+        let new_mapping = vm.map_at(addr, self.gem.size(), sgt, prot, guard)?;
+
+        let iova = new_mapping.iova();
+        assert!(iova == addr as usize);
+        mappings.try_push((vm.file_id(), vm_id, new_mapping))?;
+        Ok(())
+    }
+
+    /// Drop all mappings for this object owned by a given `Vm` identified by its ID.
+    pub(crate) fn drop_vm_mappings(&mut self, vm_id: u64) {
+        self.gem.drop_vm_mappings(vm_id);
+    }
+
+    /// Drop all mappings for this object owned by a given `File` identified by its ID.
+    pub(crate) fn drop_file_mappings(&mut self, file_id: u64) {
+        self.gem.drop_file_mappings(file_id);
+    }
+}
+
+/// Create a new kernel-owned GEM object.
+pub(crate) fn new_kernel_object(dev: &AsahiDevice, size: usize) -> Result<ObjectRef> {
+    let mut gem = shmem::Object::<DriverObject>::new(dev, size)?;
+    gem.kernel = true;
+    gem.flags = 0;
+
+    gem.set_exportable(false);
+
+    mod_pr_debug!("DriverObject new kernel object id={}\n", gem.id);
+    Ok(ObjectRef::new(gem.into_ref()))
+}
+
+/// Create a new user-owned GEM object with the given flags.
+pub(crate) fn new_object(
+    dev: &AsahiDevice,
+    size: usize,
+    flags: u32,
+    vm_id: Option<u64>,
+) -> Result<ObjectRef> {
+    let mut gem = shmem::Object::<DriverObject>::new(dev, size)?;
+    gem.kernel = false;
+    gem.flags = flags;
+    gem.vm_id = vm_id;
+
+    gem.set_exportable(vm_id.is_none());
+    gem.set_wc(flags & bindings::ASAHI_GEM_WRITEBACK == 0);
+
+    mod_pr_debug!(
+        "DriverObject new user object: vm_id={:?} id={}\n",
+        vm_id,
+        gem.id
+    );
+    Ok(ObjectRef::new(gem.into_ref()))
+}
+
+/// Look up a GEM object handle for a `File` and return an `ObjectRef` for it.
+pub(crate) fn lookup_handle(file: &DrmFile, handle: u32) -> Result<ObjectRef> {
+    Ok(ObjectRef::new(shmem::Object::lookup_handle(file, handle)?))
+}
+
+impl gem::BaseDriverObject<Object> for DriverObject {
+    /// Callback to create the inner data of a GEM object
+    fn new(_dev: &AsahiDevice, _size: usize) -> Result<DriverObject> {
+        let id = GEM_ID.fetch_add(1, Ordering::Relaxed);
+        mod_pr_debug!("DriverObject::new id={}\n", id);
+        Ok(DriverObject {
+            kernel: false,
+            flags: 0,
+            vm_id: None,
+            mappings: Mutex::new(Vec::new()),
+            id,
+        })
+    }
+
+    /// Callback to drop all mappings for a GEM object owned by a given `File`
+    fn close(obj: &Object, file: &DrmFile) {
+        mod_pr_debug!("DriverObject::close vm_id={:?} id={}\n", obj.vm_id, obj.id);
+        obj.drop_file_mappings(file.file_id());
+    }
+}
+
+impl Drop for DriverObject {
+    fn drop(&mut self) {
+        mod_pr_debug!("DriverObject::drop vm_id={:?} id={}\n", self.vm_id, self.id);
+    }
+}
+
+impl shmem::DriverObject for DriverObject {
+    type Driver = crate::driver::AsahiDriver;
+}
+
+impl rtkit::Buffer for ObjectRef {
+    fn iova(&self) -> Result<usize> {
+        self.iova(0).ok_or(EIO)
+    }
+    fn buf(&mut self) -> Result<&mut [u8]> {
+        let vmap = self.vmap.as_mut().ok_or(ENOMEM)?;
+        Ok(vmap.as_mut_slice())
+    }
+}
diff --git a/drivers/gpu/drm/asahi/gpu.rs b/drivers/gpu/drm/asahi/gpu.rs
new file mode 100644
index 000000000000..1384b52d11a2
--- /dev/null
+++ b/drivers/gpu/drm/asahi/gpu.rs
@@ -0,0 +1,1088 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+
+//! Top-level GPU manager
+//!
+//! This module is the root of all GPU firmware management for a given driver instance. It is
+//! responsible for initialization, owning the top-level managers (events, UAT, etc.), and
+//! communicating with the raw RtKit endpoints to send and receive messages to/from the GPU
+//! firmware.
+//!
+//! It is also the point where diverging driver firmware/GPU variants (using the versions macro)
+//! are unified, so that the top level of the driver itself (in `driver`) does not have to concern
+//! itself with version dependence.
+
+use core::any::Any;
+use core::sync::atomic::{AtomicBool, AtomicU64, Ordering};
+use core::time::Duration;
+
+use kernel::{
+    delay::coarse_sleep,
+    error::code::*,
+    macros::versions,
+    prelude::*,
+    soc::apple::rtkit,
+    sync::{smutex::Mutex, Arc, Guard, UniqueArc},
+    time,
+    types::ForeignOwnable,
+};
+
+use crate::alloc::Allocator;
+use crate::box_in_place;
+use crate::debug::*;
+use crate::driver::AsahiDevice;
+use crate::fw::channels::PipeType;
+use crate::fw::types::U64;
+use crate::{
+    alloc, buffer, channel, event, fw, gem, hw, initdata, mem, mmu, queue, regs, workqueue,
+};
+
+const DEBUG_CLASS: DebugFlags = DebugFlags::Gpu;
+
+/// Firmware endpoint for init & incoming notifications.
+const EP_FIRMWARE: u8 = 0x20;
+
+/// Doorbell endpoint for work/message submissions.
+const EP_DOORBELL: u8 = 0x21;
+
+/// Initialize the GPU firmware.
+const MSG_INIT: u64 = 0x81 << 48;
+const INIT_DATA_MASK: u64 = (1 << 44) - 1;
+
+/// TX channel doorbell.
+const MSG_TX_DOORBELL: u64 = 0x83 << 48;
+/// Firmware control channel doorbell.
+const MSG_FWCTL: u64 = 0x84 << 48;
+// /// Halt the firmware (?).
+// const MSG_HALT: u64 = 0x85 << 48;
+
+/// Receive channel doorbell notification.
+const MSG_RX_DOORBELL: u64 = 0x42 << 48;
+
+/// Doorbell number for firmware kicks/wakeups.
+const DOORBELL_KICKFW: u64 = 0x10;
+/// Doorbell number for device control channel kicks.
+const DOORBELL_DEVCTRL: u64 = 0x11;
+
+// Upper kernel half VA address ranges.
+/// Private (cached) firmware structure VA range base.
+const IOVA_KERN_PRIV_BASE: u64 = 0xffffffa000000000;
+/// Private (cached) firmware structure VA range top.
+const IOVA_KERN_PRIV_TOP: u64 = 0xffffffa7ffffffff;
+/// Shared (uncached) firmware structure VA range base.
+const IOVA_KERN_SHARED_BASE: u64 = 0xffffffa800000000;
+/// Shared (uncached) firmware structure VA range top.
+const IOVA_KERN_SHARED_TOP: u64 = 0xffffffa9ffffffff;
+/// Shared (uncached) read-only firmware structure VA range base.
+const IOVA_KERN_SHARED_RO_BASE: u64 = 0xffffffaa00000000;
+/// Shared (uncached) read-only firmware structure VA range top.
+const IOVA_KERN_SHARED_RO_TOP: u64 = 0xffffffabffffffff;
+/// GPU/FW shared structure VA range base.
+const IOVA_KERN_GPU_BASE: u64 = 0xffffffaf00000000;
+/// GPU/FW shared structure VA range top.
+const IOVA_KERN_GPU_TOP: u64 = 0xffffffafffffffff;
+
+/// Timeout for entering the halt state after a fault or request.
+const HALT_ENTER_TIMEOUT_MS: u64 = 100;
+
+/// Global allocators used for kernel-half structures.
+pub(crate) struct KernelAllocators {
+    pub(crate) private: alloc::DefaultAllocator,
+    pub(crate) shared: alloc::DefaultAllocator,
+    pub(crate) shared_ro: alloc::DefaultAllocator,
+    pub(crate) gpu: alloc::DefaultAllocator,
+}
+
+/// Receive (GPU->driver) ring buffer channels.
+#[versions(AGX)]
+struct RxChannels {
+    event: channel::EventChannel,
+    fw_log: channel::FwLogChannel,
+    ktrace: channel::KTraceChannel,
+    stats: channel::StatsChannel::ver,
+}
+
+/// GPU work submission pipe channels (driver->GPU).
+#[versions(AGX)]
+struct PipeChannels {
+    pub(crate) vtx: Vec<Mutex<channel::PipeChannel::ver>>,
+    pub(crate) frag: Vec<Mutex<channel::PipeChannel::ver>>,
+    pub(crate) comp: Vec<Mutex<channel::PipeChannel::ver>>,
+}
+
+/// Misc command transmit (driver->GPU) channels.
+#[versions(AGX)]
+struct TxChannels {
+    pub(crate) device_control: channel::DeviceControlChannel::ver,
+}
+
+/// Number of work submission pipes per type, one for each priority level.
+const NUM_PIPES: usize = 4;
+
+/// A generic monotonically incrementing ID used to uniquely identify object instances within the
+/// driver.
+pub(crate) struct ID(AtomicU64);
+
+impl ID {
+    /// Create a new ID counter with a given value.
+    fn new(val: u64) -> ID {
+        ID(AtomicU64::new(val))
+    }
+
+    /// Fetch the next unique ID.
+    pub(crate) fn next(&self) -> u64 {
+        self.0.fetch_add(1, Ordering::Relaxed)
+    }
+}
+
+impl Default for ID {
+    /// IDs default to starting at 2, as 0/1 are considered reserved for the system.
+    fn default() -> Self {
+        Self::new(2)
+    }
+}
+
+/// A guard representing one active submission on the GPU. When dropped, decrements the active
+/// submission count.
+pub(crate) struct OpGuard(Arc<dyn GpuManagerPriv>);
+
+impl Drop for OpGuard {
+    fn drop(&mut self) {
+        self.0.end_op();
+    }
+}
+
+/// Set of global sequence IDs used in the driver.
+#[derive(Default)]
+pub(crate) struct SequenceIDs {
+    /// `File` instance ID.
+    pub(crate) file: ID,
+    /// `Vm` instance ID.
+    pub(crate) vm: ID,
+    /// Submission instance ID.
+    pub(crate) submission: ID,
+    /// `Queue` instance ID.
+    pub(crate) queue: ID,
+}
+
+/// Top-level GPU manager that owns all the global state relevant to the driver instance.
+#[versions(AGX)]
+pub(crate) struct GpuManager {
+    dev: AsahiDevice,
+    cfg: &'static hw::HwConfig,
+    dyncfg: Box<hw::DynConfig>,
+    pub(crate) initdata: Box<fw::types::GpuObject<fw::initdata::InitData::ver>>,
+    uat: Box<mmu::Uat>,
+    crashed: AtomicBool,
+    alloc: Mutex<KernelAllocators>,
+    io_mappings: Vec<mmu::Mapping>,
+    rtkit: Mutex<Option<Box<rtkit::RtKit<GpuManager::ver>>>>,
+    rx_channels: Mutex<Box<RxChannels::ver>>,
+    tx_channels: Mutex<Box<TxChannels::ver>>,
+    fwctl_channel: Mutex<Box<channel::FwCtlChannel>>,
+    pipes: PipeChannels::ver,
+    event_manager: Arc<event::EventManager>,
+    buffer_mgr: buffer::BufferManager,
+    ids: SequenceIDs,
+}
+
+/// Trait used to abstract the firmware/GPU-dependent variants of the GpuManager.
+pub(crate) trait GpuManager: Send + Sync {
+    /// Cast as an Any type.
+    fn as_any(&self) -> &dyn Any;
+    /// Cast Arc<Self> as an Any type.
+    fn arc_as_any(self: Arc<Self>) -> Arc<dyn Any + Sync + Send>;
+    /// Initialize the GPU.
+    fn init(&self) -> Result;
+    /// Update the GPU globals from global info
+    ///
+    /// TODO: Unclear what can and cannot be updated like this.
+    fn update_globals(&self);
+    /// Get a reference to the KernelAllocators.
+    fn alloc(&self) -> Guard<'_, Mutex<KernelAllocators>>;
+    /// Create a new `Vm` given a unique `File` ID.
+    fn new_vm(&self, file_id: u64) -> Result<mmu::Vm>;
+    /// Bind a `Vm` to an available slot and return the `VmBind`.
+    fn bind_vm(&self, vm: &mmu::Vm) -> Result<mmu::VmBind>;
+    /// Create a new user command queue.
+    fn new_queue(
+        &self,
+        vm: mmu::Vm,
+        ualloc: Arc<Mutex<alloc::DefaultAllocator>>,
+        ualloc_priv: Arc<Mutex<alloc::DefaultAllocator>>,
+        priority: u32,
+        caps: u32,
+    ) -> Result<Box<dyn queue::Queue>>;
+    /// Return a reference to the global `SequenceIDs` instance.
+    fn ids(&self) -> &SequenceIDs;
+    /// Kick the firmware (wake it up if asleep).
+    ///
+    /// This should be useful to reduce latency on work submission, so we can ask the firmware to
+    /// wake up while we do some preparatory work for the work submission.
+    fn kick_firmware(&self) -> Result;
+    /// Invalidate a GPU scheduler context. Must be called before the relevant structures are freed.
+    fn invalidate_context(
+        &self,
+        context: &fw::types::GpuObject<fw::workqueue::GpuContextData>,
+    ) -> Result;
+    /// Flush the entire firmware cache.
+    ///
+    /// TODO: Does this actually work?
+    fn flush_fw_cache(&self) -> Result;
+    /// Handle a GPU work timeout event.
+    fn handle_timeout(&self, counter: u32, event_slot: u32);
+    /// Handle a GPU fault event.
+    fn handle_fault(&self);
+    /// Wait for the GPU to become idle and power off.
+    fn wait_for_poweroff(&self, timeout: usize) -> Result;
+    /// Send a firmware control command (secure cache flush).
+    fn fwctl(&self, msg: fw::channels::FwCtlMsg) -> Result;
+    /// Get the static GPU configuration for this SoC.
+    fn get_cfg(&self) -> &'static hw::HwConfig;
+    /// Get the dynamic GPU configuration for this SoC.
+    fn get_dyncfg(&self) -> &hw::DynConfig;
+}
+
+/// Private generic trait for functions that don't need to escape this module.
+trait GpuManagerPriv {
+    /// Decrement the pending submission counter.
+    fn end_op(&self);
+}
+
+#[versions(AGX)]
+#[vtable]
+impl rtkit::Operations for GpuManager::ver {
+    type Data = Arc<GpuManager::ver>;
+    type Buffer = gem::ObjectRef;
+
+    fn recv_message(data: <Self::Data as ForeignOwnable>::Borrowed<'_>, ep: u8, msg: u64) {
+        let dev = &data.dev;
+        //dev_info!(dev, "RtKit message: {:#x}:{:#x}\n", ep, msg);
+
+        if ep != EP_FIRMWARE || msg != MSG_RX_DOORBELL {
+            dev_err!(dev, "Unknown message: {:#x}:{:#x}\n", ep, msg);
+            return;
+        }
+
+        let mut ch = data.rx_channels.lock();
+
+        ch.fw_log.poll();
+        ch.ktrace.poll();
+        ch.stats.poll();
+        ch.event.poll();
+    }
+
+    fn crashed(data: <Self::Data as ForeignOwnable>::Borrowed<'_>) {
+        let dev = &data.dev;
+        dev_err!(dev, "GPU firmware crashed, failing all jobs\n");
+
+        data.crashed.store(true, Ordering::Relaxed);
+        data.event_manager.fail_all(workqueue::WorkError::NoDevice);
+    }
+
+    fn shmem_alloc(
+        data: <Self::Data as ForeignOwnable>::Borrowed<'_>,
+        size: usize,
+    ) -> Result<Self::Buffer> {
+        let dev = &data.dev;
+        mod_dev_dbg!(dev, "shmem_alloc() {:#x} bytes\n", size);
+
+        let mut obj = gem::new_kernel_object(dev, size)?;
+        obj.vmap()?;
+        let iova = obj.map_into(data.uat.kernel_vm())?;
+        mod_dev_dbg!(dev, "shmem_alloc() -> VA {:#x}\n", iova);
+        Ok(obj)
+    }
+}
+
+#[versions(AGX)]
+impl GpuManager::ver {
+    /// Create a new GpuManager of this version/GPU combination.
+    #[inline(never)]
+    pub(crate) fn new(
+        dev: &AsahiDevice,
+        res: &regs::Resources,
+        cfg: &'static hw::HwConfig,
+    ) -> Result<Arc<GpuManager::ver>> {
+        let uat = Self::make_uat(dev, cfg)?;
+        let dyncfg = Self::make_dyncfg(dev, res, cfg, &uat)?;
+
+        let mut alloc = KernelAllocators {
+            private: alloc::DefaultAllocator::new(
+                dev,
+                uat.kernel_vm(),
+                IOVA_KERN_PRIV_BASE,
+                IOVA_KERN_PRIV_TOP,
+                0x80,
+                mmu::PROT_FW_PRIV_RW,
+                1024 * 1024,
+                true,
+                fmt!("Kernel Private"),
+                true,
+            )?,
+            shared: alloc::DefaultAllocator::new(
+                dev,
+                uat.kernel_vm(),
+                IOVA_KERN_SHARED_BASE,
+                IOVA_KERN_SHARED_TOP,
+                0x80,
+                mmu::PROT_FW_SHARED_RW,
+                1024 * 1024,
+                true,
+                fmt!("Kernel Shared"),
+                false,
+            )?,
+            shared_ro: alloc::DefaultAllocator::new(
+                dev,
+                uat.kernel_vm(),
+                IOVA_KERN_SHARED_RO_BASE,
+                IOVA_KERN_SHARED_RO_TOP,
+                0x80,
+                mmu::PROT_FW_SHARED_RO,
+                64 * 1024,
+                true,
+                fmt!("Kernel RO Shared"),
+                false,
+            )?,
+            gpu: alloc::DefaultAllocator::new(
+                dev,
+                uat.kernel_vm(),
+                IOVA_KERN_GPU_BASE,
+                IOVA_KERN_GPU_TOP,
+                0x80,
+                mmu::PROT_GPU_FW_SHARED_RW,
+                64 * 1024,
+                true,
+                fmt!("Kernel GPU Shared"),
+                false,
+            )?,
+        };
+
+        let event_manager = Self::make_event_manager(&mut alloc)?;
+        let initdata = Self::make_initdata(cfg, &dyncfg, &mut alloc)?;
+        let mut mgr = Self::make_mgr(dev, cfg, dyncfg, uat, alloc, event_manager, initdata)?;
+
+        {
+            let fwctl = mgr.fwctl_channel.lock();
+            let p_fwctl = fwctl.to_raw();
+            core::mem::drop(fwctl);
+
+            mgr.initdata.fw_status.with_mut(|raw, _inner| {
+                raw.fwctl_channel = p_fwctl;
+            });
+        }
+
+        {
+            let txc = mgr.tx_channels.lock();
+            let p_device_control = txc.device_control.to_raw();
+            core::mem::drop(txc);
+
+            let rxc = mgr.rx_channels.lock();
+            let p_event = rxc.event.to_raw();
+            let p_fw_log = rxc.fw_log.to_raw();
+            let p_ktrace = rxc.ktrace.to_raw();
+            let p_stats = rxc.stats.to_raw();
+            let p_fwlog_buf = rxc.fw_log.get_buf();
+            core::mem::drop(rxc);
+
+            mgr.initdata.runtime_pointers.with_mut(|raw, _inner| {
+                raw.device_control = p_device_control;
+                raw.event = p_event;
+                raw.fw_log = p_fw_log;
+                raw.ktrace = p_ktrace;
+                raw.stats = p_stats;
+                raw.fwlog_buf = Some(p_fwlog_buf);
+            });
+        }
+
+        let mut p_pipes: Vec<fw::initdata::raw::PipeChannels::ver> = Vec::new();
+
+        for ((v, f), c) in mgr
+            .pipes
+            .vtx
+            .iter()
+            .zip(&mgr.pipes.frag)
+            .zip(&mgr.pipes.comp)
+        {
+            p_pipes.try_push(fw::initdata::raw::PipeChannels::ver {
+                vtx: v.lock().to_raw(),
+                frag: f.lock().to_raw(),
+                comp: c.lock().to_raw(),
+            })?;
+        }
+
+        mgr.initdata.runtime_pointers.with_mut(|raw, _inner| {
+            for (i, p) in p_pipes.into_iter().enumerate() {
+                raw.pipes[i].vtx = p.vtx;
+                raw.pipes[i].frag = p.frag;
+                raw.pipes[i].comp = p.comp;
+            }
+        });
+
+        for (i, map) in cfg.io_mappings.iter().enumerate() {
+            if let Some(map) = map.as_ref() {
+                mgr.iomap(i, map)?;
+            }
+        }
+
+        let mgr = Arc::from(mgr);
+
+        let rtkit = Box::try_new(rtkit::RtKit::<GpuManager::ver>::new(
+            dev,
+            None,
+            0,
+            mgr.clone(),
+        )?)?;
+
+        *mgr.rtkit.lock() = Some(rtkit);
+
+        {
+            let mut rxc = mgr.rx_channels.lock();
+            rxc.event.set_manager(mgr.clone());
+        }
+
+        Ok(mgr)
+    }
+
+    /// Build the entire GPU InitData structure tree and return it as a boxed GpuObject.
+    fn make_initdata(
+        cfg: &'static hw::HwConfig,
+        dyncfg: &hw::DynConfig,
+        alloc: &mut KernelAllocators,
+    ) -> Result<Box<fw::types::GpuObject<fw::initdata::InitData::ver>>> {
+        let mut builder = initdata::InitDataBuilder::ver::new(alloc, cfg, dyncfg);
+        builder.build()
+    }
+
+    /// Create a fresh boxed Uat instance.
+    ///
+    /// Force disable inlining to avoid blowing up the stack.
+    #[inline(never)]
+    fn make_uat(dev: &AsahiDevice, cfg: &'static hw::HwConfig) -> Result<Box<mmu::Uat>> {
+        Ok(Box::try_new(mmu::Uat::new(dev, cfg)?)?)
+    }
+
+    /// Actually create the final GpuManager instance, as a UniqueArc.
+    ///
+    /// Force disable inlining to avoid blowing up the stack.
+    #[inline(never)]
+    fn make_mgr(
+        dev: &AsahiDevice,
+        cfg: &'static hw::HwConfig,
+        dyncfg: Box<hw::DynConfig>,
+        uat: Box<mmu::Uat>,
+        mut alloc: KernelAllocators,
+        event_manager: Arc<event::EventManager>,
+        initdata: Box<fw::types::GpuObject<fw::initdata::InitData::ver>>,
+    ) -> Result<UniqueArc<GpuManager::ver>> {
+        let mut pipes = PipeChannels::ver {
+            vtx: Vec::new(),
+            frag: Vec::new(),
+            comp: Vec::new(),
+        };
+
+        for _i in 0..=NUM_PIPES - 1 {
+            pipes
+                .vtx
+                .try_push(Mutex::new(channel::PipeChannel::ver::new(dev, &mut alloc)?))?;
+            pipes
+                .frag
+                .try_push(Mutex::new(channel::PipeChannel::ver::new(dev, &mut alloc)?))?;
+            pipes
+                .comp
+                .try_push(Mutex::new(channel::PipeChannel::ver::new(dev, &mut alloc)?))?;
+        }
+
+        UniqueArc::try_new(GpuManager::ver {
+            dev: dev.clone(),
+            cfg,
+            dyncfg,
+            initdata,
+            uat,
+            io_mappings: Vec::new(),
+            rtkit: Mutex::new(None),
+            crashed: AtomicBool::new(false),
+            rx_channels: Mutex::new(box_in_place!(RxChannels::ver {
+                event: channel::EventChannel::new(dev, &mut alloc, event_manager.clone())?,
+                fw_log: channel::FwLogChannel::new(dev, &mut alloc)?,
+                ktrace: channel::KTraceChannel::new(dev, &mut alloc)?,
+                stats: channel::StatsChannel::ver::new(dev, &mut alloc)?,
+            })?),
+            tx_channels: Mutex::new(Box::try_new(TxChannels::ver {
+                device_control: channel::DeviceControlChannel::ver::new(dev, &mut alloc)?,
+            })?),
+            fwctl_channel: Mutex::new(Box::try_new(channel::FwCtlChannel::new(dev, &mut alloc)?)?),
+            pipes,
+            event_manager,
+            buffer_mgr: buffer::BufferManager::new()?,
+            alloc: Mutex::new(alloc),
+            ids: Default::default(),
+        })
+    }
+
+    /// Fetch and validate the GPU dynamic configuration from the device tree and hardware.
+    ///
+    /// Force disable inlining to avoid blowing up the stack.
+    #[inline(never)]
+    fn make_dyncfg(
+        dev: &AsahiDevice,
+        res: &regs::Resources,
+        cfg: &'static hw::HwConfig,
+        uat: &mmu::Uat,
+    ) -> Result<Box<hw::DynConfig>> {
+        let gpu_id = res.get_gpu_id()?;
+
+        dev_info!(dev, "GPU Information:\n");
+        dev_info!(
+            dev,
+            "  Type: {:?}{:?}\n",
+            gpu_id.gpu_gen,
+            gpu_id.gpu_variant
+        );
+        dev_info!(dev, "  Max dies: {}\n", gpu_id.max_dies);
+        dev_info!(dev, "  Clusters: {}\n", gpu_id.num_clusters);
+        dev_info!(
+            dev,
+            "  Cores: {} ({})\n",
+            gpu_id.num_cores,
+            gpu_id.num_cores * gpu_id.num_clusters
+        );
+        dev_info!(
+            dev,
+            "  Frags: {} ({})\n",
+            gpu_id.num_frags,
+            gpu_id.num_frags * gpu_id.num_clusters
+        );
+        dev_info!(
+            dev,
+            "  GPs: {} ({})\n",
+            gpu_id.num_gps,
+            gpu_id.num_gps * gpu_id.num_clusters
+        );
+        dev_info!(dev, "  Core masks: {:#x?}\n", gpu_id.core_masks);
+        dev_info!(dev, "  Active cores: {}\n", gpu_id.total_active_cores);
+
+        dev_info!(dev, "Getting configuration from device tree...\n");
+        let pwr_cfg = hw::PwrConfig::load(dev, cfg)?;
+        dev_info!(dev, "Dynamic configuration fetched\n");
+
+        if gpu_id.gpu_gen != cfg.gpu_gen || gpu_id.gpu_variant != cfg.gpu_variant {
+            dev_err!(
+                dev,
+                "GPU type mismatch (expected {:?}{:?}, found {:?}{:?})\n",
+                cfg.gpu_gen,
+                cfg.gpu_variant,
+                gpu_id.gpu_gen,
+                gpu_id.gpu_variant
+            );
+            return Err(EIO);
+        }
+        if gpu_id.num_clusters > cfg.max_num_clusters {
+            dev_err!(
+                dev,
+                "Too many clusters ({} > {})\n",
+                gpu_id.num_clusters,
+                cfg.max_num_clusters
+            );
+            return Err(EIO);
+        }
+        if gpu_id.num_cores > cfg.max_num_cores {
+            dev_err!(
+                dev,
+                "Too many cores ({} > {})\n",
+                gpu_id.num_cores,
+                cfg.max_num_cores
+            );
+            return Err(EIO);
+        }
+        if gpu_id.num_frags > cfg.max_num_frags {
+            dev_err!(
+                dev,
+                "Too many frags ({} > {})\n",
+                gpu_id.num_frags,
+                cfg.max_num_frags
+            );
+            return Err(EIO);
+        }
+        if gpu_id.num_gps > cfg.max_num_gps {
+            dev_err!(
+                dev,
+                "Too many GPs ({} > {})\n",
+                gpu_id.num_gps,
+                cfg.max_num_gps
+            );
+            return Err(EIO);
+        }
+
+        Ok(Box::try_new(hw::DynConfig {
+            pwr: pwr_cfg,
+            uat_ttb_base: uat.ttb_base(),
+            id: gpu_id,
+        })?)
+    }
+
+    /// Create the global GPU event manager, and return an `Arc<>` to it.
+    fn make_event_manager(alloc: &mut KernelAllocators) -> Result<Arc<event::EventManager>> {
+        Arc::try_new(event::EventManager::new(alloc)?)
+    }
+
+    /// Create a new MMIO mapping and add it to the mappings list in initdata at the specified
+    /// index.
+    fn iomap(&mut self, index: usize, map: &hw::IOMapping) -> Result {
+        let off = map.base & mmu::UAT_PGMSK;
+        let base = map.base - off;
+        let end = (map.base + map.size + mmu::UAT_PGMSK) & !mmu::UAT_PGMSK;
+        let mapping = self
+            .uat
+            .kernel_vm()
+            .map_io(base, end - base, map.writable)?;
+
+        self.initdata.runtime_pointers.hwdata_b.with_mut(|raw, _| {
+            raw.io_mappings[index] = fw::initdata::raw::IOMapping {
+                phys_addr: U64(map.base as u64),
+                virt_addr: U64((mapping.iova() + off) as u64),
+                size: map.size as u32,
+                range_size: map.range_size as u32,
+                readwrite: U64(map.writable as u64),
+            };
+        });
+
+        self.io_mappings.try_push(mapping)?;
+        Ok(())
+    }
+
+    /// Mark work associated with currently in-progress event slots as failed, after a fault or
+    /// timeout.
+    fn mark_pending_events(&self, culprit_slot: Option<u32>, error: workqueue::WorkError) {
+        dev_err!(self.dev, "  Pending events:\n");
+
+        self.initdata.globals.with(|raw, _inner| {
+            for i in raw.pending_stamps.iter() {
+                let info = i.info.load(Ordering::Relaxed);
+                let wait_value = i.wait_value.load(Ordering::Relaxed);
+
+                if info & 1 != 0 {
+                    let slot = info >> 3;
+                    let flags = info & 0x7;
+                    dev_err!(
+                        self.dev,
+                        "    [{}] flags={} value={:#x}\n",
+                        slot,
+                        flags,
+                        wait_value
+                    );
+                    let error = if culprit_slot.is_some() && culprit_slot != Some(slot) {
+                        workqueue::WorkError::Killed
+                    } else {
+                        error
+                    };
+                    self.event_manager.mark_error(slot, wait_value, error);
+                    i.info.store(0, Ordering::Relaxed);
+                    i.wait_value.store(0, Ordering::Relaxed);
+                }
+            }
+        });
+    }
+
+    /// Fetch the GPU MMU fault information from the hardware registers.
+    fn get_fault_info(&self) -> Option<regs::FaultInfo> {
+        let data = self.dev.data();
+
+        let res = match data.resources() {
+            Some(res) => res,
+            None => {
+                dev_err!(self.dev, "  Failed to acquire resources\n");
+                return None;
+            }
+        };
+
+        let info = res.get_fault_info();
+        if info.is_some() {
+            dev_err!(self.dev, "  Fault info: {:#x?}\n", info.as_ref().unwrap());
+        }
+        info
+    }
+
+    /// Resume the GPU firmware after it halts (due to a timeout, fault, or request).
+    fn recover(&self) {
+        self.initdata.fw_status.with(|raw, _inner| {
+            let halt_count = raw.flags.halt_count.load(Ordering::Relaxed);
+            let mut halted = raw.flags.halted.load(Ordering::Relaxed);
+            dev_err!(self.dev, "  Halt count: {}\n", halt_count);
+            dev_err!(self.dev, "  Halted: {}\n", halted);
+
+            if halted == 0 {
+                let timeout = time::ktime_get() + Duration::from_millis(HALT_ENTER_TIMEOUT_MS);
+                while time::ktime_get() < timeout {
+                    halted = raw.flags.halted.load(Ordering::Relaxed);
+                    if halted != 0 {
+                        break;
+                    }
+                    mem::sync();
+                }
+                halted = raw.flags.halted.load(Ordering::Relaxed);
+            }
+
+            if debug_enabled(DebugFlags::NoGpuRecovery) {
+                dev_crit!(self.dev, "  GPU recovery is disabled, wedging forever!\n");
+            } else if halted != 0 {
+                dev_err!(self.dev, "  Attempting recovery...\n");
+                raw.flags.halted.store(0, Ordering::SeqCst);
+                raw.flags.resume.store(1, Ordering::SeqCst);
+            } else {
+                dev_err!(self.dev, "  Cannot recover.\n");
+            }
+        });
+    }
+
+    /// Return the packed GPU enabled core masks.
+    // Only used for some versions
+    #[allow(dead_code)]
+    pub(crate) fn core_masks_packed(&self) -> &[u32] {
+        self.dyncfg.id.core_masks_packed.as_slice()
+    }
+
+    /// Kick a submission pipe for a submitted job to tell the firmware to start processing it.
+    pub(crate) fn run_job(&self, job: workqueue::JobSubmission::ver<'_>) -> Result {
+        mod_dev_dbg!(self.dev, "GPU: run_job\n");
+
+        let pipe_type = job.pipe_type();
+        mod_dev_dbg!(self.dev, "GPU: run_job: pipe_type={:?}\n", pipe_type);
+
+        let pipes = match pipe_type {
+            PipeType::Vertex => &self.pipes.vtx,
+            PipeType::Fragment => &self.pipes.frag,
+            PipeType::Compute => &self.pipes.comp,
+        };
+
+        let index: usize = job.priority() as usize;
+        let mut pipe = pipes.get(index).ok_or(EIO)?.lock();
+
+        mod_dev_dbg!(self.dev, "GPU: run_job: run()\n");
+        job.run(&mut pipe);
+        mod_dev_dbg!(self.dev, "GPU: run_job: ring doorbell\n");
+
+        let mut guard = self.rtkit.lock();
+        let rtk = guard.as_mut().unwrap();
+        rtk.send_message(
+            EP_DOORBELL,
+            MSG_TX_DOORBELL | pipe_type as u64 | ((index as u64) << 2),
+        )?;
+        mod_dev_dbg!(self.dev, "GPU: run_job: done\n");
+
+        Ok(())
+    }
+
+    pub(crate) fn is_crashed(&self) -> bool {
+        self.crashed.load(Ordering::Relaxed)
+    }
+
+    pub(crate) fn start_op(self: &Arc<GpuManager::ver>) -> Result<OpGuard> {
+        if self.is_crashed() {
+            return Err(ENODEV);
+        }
+
+        let val = self
+            .initdata
+            .globals
+            .with(|raw, _inner| raw.pending_submissions.fetch_add(1, Ordering::Acquire));
+
+        mod_dev_dbg!(self.dev, "OP start (pending: {})\n", val + 1);
+        self.kick_firmware()?;
+        Ok(OpGuard(self.clone()))
+    }
+}
+
+#[versions(AGX)]
+impl GpuManager for GpuManager::ver {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn arc_as_any(self: Arc<Self>) -> Arc<dyn Any + Sync + Send> {
+        self as Arc<dyn Any + Sync + Send>
+    }
+
+    fn init(&self) -> Result {
+        self.tx_channels.lock().device_control.send(
+            &fw::channels::DeviceControlMsg::ver::Initialize(Default::default()),
+        );
+
+        let initdata = self.initdata.gpu_va().get();
+        let mut guard = self.rtkit.lock();
+        let rtk = guard.as_mut().unwrap();
+
+        rtk.boot()?;
+        rtk.start_endpoint(EP_FIRMWARE)?;
+        rtk.start_endpoint(EP_DOORBELL)?;
+        rtk.send_message(EP_FIRMWARE, MSG_INIT | (initdata & INIT_DATA_MASK))?;
+        rtk.send_message(EP_DOORBELL, MSG_TX_DOORBELL | DOORBELL_DEVCTRL)?;
+        core::mem::drop(guard);
+
+        self.kick_firmware()?;
+        Ok(())
+    }
+
+    fn update_globals(&self) {
+        let mut timeout: u32 = 2;
+        if debug_enabled(DebugFlags::WaitForPowerOff) {
+            timeout = 0;
+        } else if debug_enabled(DebugFlags::KeepGpuPowered) {
+            timeout = 5000;
+        }
+
+        self.initdata.globals.with(|raw, _inner| {
+            raw.idle_off_delay_ms.store(timeout, Ordering::Relaxed);
+        });
+    }
+
+    fn alloc(&self) -> Guard<'_, Mutex<KernelAllocators>> {
+        let mut guard = self.alloc.lock();
+        let (garbage_count, garbage_bytes) = guard.private.garbage();
+        if garbage_bytes > 1024 * 1024 {
+            mod_dev_dbg!(
+                self.dev,
+                "Collecting kalloc garbage ({} objects, {} bytes)\n",
+                garbage_count,
+                garbage_bytes
+            );
+            if self.flush_fw_cache().is_err() {
+                dev_err!(self.dev, "Failed to flush FW cache\n");
+            } else {
+                guard.private.collect_garbage(garbage_count);
+            }
+        }
+
+        guard
+    }
+
+    fn new_vm(&self, file_id: u64) -> Result<mmu::Vm> {
+        self.uat.new_vm(self.ids.vm.next(), file_id)
+    }
+
+    fn bind_vm(&self, vm: &mmu::Vm) -> Result<mmu::VmBind> {
+        self.uat.bind(vm)
+    }
+
+    fn new_queue(
+        &self,
+        vm: mmu::Vm,
+        ualloc: Arc<Mutex<alloc::DefaultAllocator>>,
+        ualloc_priv: Arc<Mutex<alloc::DefaultAllocator>>,
+        priority: u32,
+        caps: u32,
+    ) -> Result<Box<dyn queue::Queue>> {
+        let mut kalloc = self.alloc();
+        let id = self.ids.queue.next();
+        Ok(Box::try_new(queue::Queue::ver::new(
+            &self.dev,
+            vm,
+            &mut kalloc,
+            ualloc,
+            ualloc_priv,
+            self.event_manager.clone(),
+            &self.buffer_mgr,
+            id,
+            priority,
+            caps,
+        )?)?)
+    }
+
+    fn kick_firmware(&self) -> Result {
+        if self.is_crashed() {
+            return Err(ENODEV);
+        }
+
+        let mut guard = self.rtkit.lock();
+        let rtk = guard.as_mut().unwrap();
+        rtk.send_message(EP_DOORBELL, MSG_TX_DOORBELL | DOORBELL_KICKFW)?;
+
+        Ok(())
+    }
+
+    fn invalidate_context(
+        &self,
+        context: &fw::types::GpuObject<fw::workqueue::GpuContextData>,
+    ) -> Result {
+        mod_dev_dbg!(
+            self.dev,
+            "Invalidating GPU context @ {:?}\n",
+            context.weak_pointer()
+        );
+
+        if self.is_crashed() {
+            return Err(ENODEV);
+        }
+
+        let mut guard = self.alloc.lock();
+        let (garbage_count, _) = guard.private.garbage();
+
+        let dc = context.with(
+            |raw, _inner| fw::channels::DeviceControlMsg::ver::DestroyContext {
+                unk_4: 0,
+                ctx_23: raw.unk_23,
+                __pad0: Default::default(),
+                unk_c: 0,
+                unk_10: 0,
+                ctx_0: raw.unk_0,
+                ctx_1: raw.unk_1,
+                ctx_4: raw.unk_4,
+                __pad1: Default::default(),
+                unk_18: 0,
+                gpu_context: Some(context.weak_pointer()),
+                __pad2: Default::default(),
+            },
+        );
+
+        mod_dev_dbg!(self.dev, "Context invalidation command: {:?}\n", &dc);
+
+        let mut txch = self.tx_channels.lock();
+
+        let token = txch.device_control.send(&dc);
+
+        {
+            let mut guard = self.rtkit.lock();
+            let rtk = guard.as_mut().unwrap();
+            rtk.send_message(EP_DOORBELL, MSG_TX_DOORBELL | DOORBELL_DEVCTRL)?;
+        }
+
+        txch.device_control.wait_for(token)?;
+
+        mod_dev_dbg!(
+            self.dev,
+            "GPU context invalidated: {:?}\n",
+            context.weak_pointer()
+        );
+
+        // The invalidation does a cache flush, so it is okay to collect garbage
+        guard.private.collect_garbage(garbage_count);
+
+        Ok(())
+    }
+
+    fn flush_fw_cache(&self) -> Result {
+        mod_dev_dbg!(self.dev, "Flushing coprocessor data cache\n");
+
+        if self.is_crashed() {
+            return Err(ENODEV);
+        }
+
+        // ctx_0 == 0xff or ctx_1 == 0xff cause no effect on context,
+        // but this command does a full cache flush too, so abuse it
+        // for that.
+
+        let dc = fw::channels::DeviceControlMsg::ver::DestroyContext {
+            unk_4: 0,
+            ctx_23: 0,
+            __pad0: Default::default(),
+            unk_c: 0,
+            unk_10: 0,
+            ctx_0: 0xff,
+            ctx_1: 0xff,
+            ctx_4: 0,
+            __pad1: Default::default(),
+            unk_18: 0,
+            gpu_context: None,
+            __pad2: Default::default(),
+        };
+
+        let mut txch = self.tx_channels.lock();
+
+        let token = txch.device_control.send(&dc);
+        {
+            let mut guard = self.rtkit.lock();
+            let rtk = guard.as_mut().unwrap();
+            rtk.send_message(EP_DOORBELL, MSG_TX_DOORBELL | DOORBELL_DEVCTRL)?;
+        }
+
+        txch.device_control.wait_for(token)?;
+        Ok(())
+    }
+
+    fn ids(&self) -> &SequenceIDs {
+        &self.ids
+    }
+
+    fn handle_timeout(&self, counter: u32, event_slot: u32) {
+        dev_err!(self.dev, " (\\________/) \n");
+        dev_err!(self.dev, "  |        |  \n");
+        dev_err!(self.dev, "'.| \\  , / |.'\n");
+        dev_err!(self.dev, "--| / (( \\ |--\n");
+        dev_err!(self.dev, ".'|  _-_-  |'.\n");
+        dev_err!(self.dev, "  |________|  \n");
+        dev_err!(self.dev, "** GPU timeout nya~!!!!! **\n");
+        dev_err!(self.dev, "  Event slot: {}\n", event_slot);
+        dev_err!(self.dev, "  Timeout count: {}\n", counter);
+
+        // If we have fault info, consider it a fault.
+        let error = match self.get_fault_info() {
+            Some(info) => workqueue::WorkError::Fault(info),
+            None => workqueue::WorkError::Timeout,
+        };
+        self.mark_pending_events(Some(event_slot), error);
+        self.recover();
+    }
+
+    fn handle_fault(&self) {
+        dev_err!(self.dev, " (\\________/) \n");
+        dev_err!(self.dev, "  |        |  \n");
+        dev_err!(self.dev, "'.| \\  , / |.'\n");
+        dev_err!(self.dev, "--| / (( \\ |--\n");
+        dev_err!(self.dev, ".'|  _-_-  |'.\n");
+        dev_err!(self.dev, "  |________|  \n");
+        dev_err!(self.dev, "GPU fault nya~!!!!!\n");
+        let error = match self.get_fault_info() {
+            Some(info) => workqueue::WorkError::Fault(info),
+            None => workqueue::WorkError::Unknown,
+        };
+        self.mark_pending_events(None, error);
+        self.recover();
+    }
+
+    fn wait_for_poweroff(&self, timeout: usize) -> Result {
+        self.initdata.runtime_pointers.hwdata_a.with(|raw, _inner| {
+            for _i in 0..timeout {
+                if raw.pwr_status.load(Ordering::Relaxed) == 4 {
+                    return Ok(());
+                }
+                coarse_sleep(Duration::from_millis(1));
+            }
+            Err(ETIMEDOUT)
+        })
+    }
+
+    fn fwctl(&self, msg: fw::channels::FwCtlMsg) -> Result {
+        if self.is_crashed() {
+            return Err(ENODEV);
+        }
+
+        let mut fwctl = self.fwctl_channel.lock();
+        let token = fwctl.send(&msg);
+        {
+            let mut guard = self.rtkit.lock();
+            let rtk = guard.as_mut().unwrap();
+            rtk.send_message(EP_DOORBELL, MSG_FWCTL)?;
+        }
+        fwctl.wait_for(token)?;
+        Ok(())
+    }
+
+    fn get_cfg(&self) -> &'static hw::HwConfig {
+        self.cfg
+    }
+
+    fn get_dyncfg(&self) -> &hw::DynConfig {
+        &self.dyncfg
+    }
+}
+
+#[versions(AGX)]
+impl GpuManagerPriv for GpuManager::ver {
+    fn end_op(&self) {
+        let val = self
+            .initdata
+            .globals
+            .with(|raw, _inner| raw.pending_submissions.fetch_sub(1, Ordering::Release));
+
+        mod_dev_dbg!(self.dev, "OP end (pending: {})\n", val - 1);
+    }
+}
diff --git a/drivers/gpu/drm/asahi/hw/mod.rs b/drivers/gpu/drm/asahi/hw/mod.rs
new file mode 100644
index 000000000000..a92bb70aeae8
--- /dev/null
+++ b/drivers/gpu/drm/asahi/hw/mod.rs
@@ -0,0 +1,522 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+
+//! Per-SoC hardware configuration structures
+//!
+//! This module contains the definitions used to store per-GPU and per-SoC configuration data.
+
+use crate::driver::AsahiDevice;
+use crate::fw::types::*;
+use alloc::vec::Vec;
+use kernel::c_str;
+use kernel::device::RawDevice;
+use kernel::prelude::*;
+
+const MAX_POWERZONES: usize = 5;
+
+pub(crate) mod t600x;
+pub(crate) mod t8103;
+pub(crate) mod t8112;
+
+/// GPU generation enumeration. Note: Part of the UABI.
+#[derive(Debug, PartialEq, Copy, Clone)]
+#[repr(u32)]
+pub(crate) enum GpuGen {
+    G13 = 13,
+    G14 = 14,
+}
+
+/// GPU variant enumeration. Note: Part of the UABI.
+#[derive(Debug, PartialEq, Copy, Clone)]
+#[repr(u32)]
+pub(crate) enum GpuVariant {
+    P = 'P' as u32,
+    G = 'G' as u32,
+    S = 'S' as u32,
+    C = 'C' as u32,
+    D = 'D' as u32,
+}
+
+/// GPU revision enumeration. Note: Part of the UABI.
+#[derive(Debug, PartialEq, Copy, Clone)]
+#[repr(u32)]
+pub(crate) enum GpuRevision {
+    A0 = 0x00,
+    A1 = 0x01,
+    B0 = 0x10,
+    B1 = 0x11,
+    C0 = 0x20,
+    C1 = 0x21,
+}
+
+/// GPU core type enumeration. Note: Part of the firmware ABI.
+#[derive(Debug, Copy, Clone)]
+#[repr(u32)]
+pub(crate) enum GpuCore {
+    // Unknown = 0,
+    // G5P = 1,
+    // G5G = 2,
+    // G9P = 3,
+    // G9G = 4,
+    // G10P = 5,
+    // G11P = 6,
+    // G11M = 7,
+    // G11G = 8,
+    // G12P = 9,
+    // G13P = 10,
+    G13G = 11,
+    G13S = 12,
+    G13C = 13,
+    // G14P = 14,
+    G14G = 15,
+}
+
+/// GPU revision ID. Note: Part of the firmware ABI.
+#[derive(Debug, PartialEq, Copy, Clone)]
+#[repr(u32)]
+pub(crate) enum GpuRevisionID {
+    // Unknown = 0,
+    A0 = 1,
+    A1 = 2,
+    B0 = 3,
+    B1 = 4,
+    C0 = 5,
+    C1 = 6,
+}
+
+/// GPU driver/hardware features, from the UABI.
+pub(crate) mod feat {
+    /// Backwards-compatible features.
+    pub(crate) mod compat {}
+
+    /// Backwards-incompatible features.
+    pub(crate) mod incompat {
+        use kernel::bindings;
+
+        /// Hardware requires Z/S compression to be mandatorily enabled.
+        pub(crate) const MANDATORY_ZS_COMPRESSION: u64 =
+            bindings::drm_asahi_feat_incompat_DRM_ASAHI_FEAT_MANDATORY_ZS_COMPRESSION as u64;
+    }
+}
+
+/// A single performance state of the GPU.
+#[derive(Debug)]
+pub(crate) struct PState {
+    /// Voltage in millivolts, per GPU cluster.
+    pub(crate) volt_mv: Vec<u32>,
+    /// Frequency in hertz.
+    pub(crate) freq_hz: u32,
+    /// Maximum power consumption of the GPU at this pstate, in milliwatts.
+    pub(crate) pwr_mw: u32,
+}
+
+/// A power zone definition (we have no idea what this is but Apple puts them in the DT).
+#[allow(missing_docs)]
+#[derive(Debug, Copy, Clone)]
+pub(crate) struct PowerZone {
+    pub(crate) target: u32,
+    pub(crate) target_offset: u32,
+    pub(crate) filter_tc: u32,
+}
+
+/// An MMIO mapping used by the firmware.
+#[derive(Debug, Copy, Clone)]
+pub(crate) struct IOMapping {
+    /// Base physical address of the mapping.
+    pub(crate) base: usize,
+    /// Size of the mapping.
+    pub(crate) size: usize,
+    /// Range size of the mapping (for arrays?)
+    pub(crate) range_size: usize,
+    /// Whether the mapping should be writable.
+    pub(crate) writable: bool,
+}
+
+impl IOMapping {
+    /// Convenience constructor for a new IOMapping.
+    pub(crate) const fn new(
+        base: usize,
+        size: usize,
+        range_size: usize,
+        writable: bool,
+    ) -> IOMapping {
+        IOMapping {
+            base,
+            size,
+            range_size,
+            writable,
+        }
+    }
+}
+
+/// Unknown HwConfigA fields that vary from SoC to SoC.
+#[allow(missing_docs)]
+#[derive(Debug, Copy, Clone)]
+pub(crate) struct HwConfigA {
+    pub(crate) unk_87c: i32,
+    pub(crate) unk_8cc: u32,
+    pub(crate) unk_e24: u32,
+}
+
+/// Unknown HwConfigB fields that vary from SoC to SoC.
+#[allow(missing_docs)]
+#[derive(Debug, Copy, Clone)]
+pub(crate) struct HwConfigB {
+    pub(crate) unk_4e0: u64,
+    pub(crate) unk_534: u32,
+    pub(crate) unk_ab8: u32,
+    pub(crate) unk_abc: u32,
+    pub(crate) unk_b30: u32,
+}
+
+/// Render command configs that vary from SoC to SoC.
+#[derive(Debug, Copy, Clone)]
+pub(crate) struct HwRenderConfig {
+    /// Vertex/tiling-related configuration register (lsb: disable clustering)
+    pub(crate) tiling_control: u32,
+}
+
+/// Static hardware configuration for a given SoC model.
+#[derive(Debug)]
+pub(crate) struct HwConfig {
+    /// Chip ID in hex format (e.g. 0x8103 for t8103).
+    pub(crate) chip_id: u32,
+    /// GPU generation.
+    pub(crate) gpu_gen: GpuGen,
+    /// GPU variant type.
+    pub(crate) gpu_variant: GpuVariant,
+    /// GPU core type ID (as known by the firmware).
+    pub(crate) gpu_core: GpuCore,
+    /// Compatible feature bitmask for this GPU.
+    pub(crate) gpu_feat_compat: u64,
+    /// Incompatible feature bitmask for this GPU.
+    pub(crate) gpu_feat_incompat: u64,
+
+    /// Base clock used used for timekeeping.
+    pub(crate) base_clock_hz: u32,
+    /// Output address space for the UAT on this SoC.
+    pub(crate) uat_oas: usize,
+    /// Maximum number of clusters on this SoC.
+    pub(crate) max_num_clusters: u32,
+    /// Maximum number of cores per cluster for this GPU.
+    pub(crate) max_num_cores: u32,
+    /// Maximum number of frags per cluster for this GPU.
+    pub(crate) max_num_frags: u32,
+    /// Maximum number of GPs per cluster for this GPU.
+    pub(crate) max_num_gps: u32,
+
+    /// Required size of the first preemption buffer.
+    pub(crate) preempt1_size: usize,
+    /// Required size of the second preemption buffer.
+    pub(crate) preempt2_size: usize,
+    /// Required size of the third preemption buffer.
+    pub(crate) preempt3_size: usize,
+
+    /// Rendering-relevant configuration.
+    pub(crate) render: HwRenderConfig,
+
+    /// Misc HWDataA field values.
+    pub(crate) da: HwConfigA,
+    /// Misc HWDataB field values.
+    pub(crate) db: HwConfigB,
+    /// HwDataShared1.table.
+    pub(crate) shared1_tab: &'static [i32],
+    /// HwDataShared1.unk_a4.
+    pub(crate) shared1_a4: u32,
+    /// HwDataShared2.table.
+    pub(crate) shared2_tab: &'static [i32],
+    /// HwDataShared2.unk_508.
+    pub(crate) shared2_unk_508: u32,
+    /// Constant related to SRAM voltages.
+    pub(crate) sram_k: F32,
+    /// Unknown per-cluster coefficients 1.
+    pub(crate) unk_coef_a: &'static [&'static [F32]],
+    /// Unknown per-cluster coefficients 2.
+    pub(crate) unk_coef_b: &'static [&'static [F32]],
+    /// Unknown table in Global struct.
+    pub(crate) global_tab: Option<&'static [u8]>,
+
+    /// Temperature sensor list (8 bits per sensor).
+    pub(crate) fast_die0_sensor_mask: u64,
+    /// Temperature sensor list (alternate).
+    pub(crate) fast_die0_sensor_mask_alt: u64,
+    /// Temperature sensor present bitmask.
+    pub(crate) fast_die0_sensor_present: u32,
+    /// Required MMIO mappings for this GPU/firmware.
+    pub(crate) io_mappings: &'static [Option<IOMapping>],
+}
+
+/// Dynamic (fetched from hardware/DT) configuration.
+#[derive(Debug)]
+pub(crate) struct DynConfig {
+    /// Base physical address of the UAT TTB (from DT reserved memory region).
+    pub(crate) uat_ttb_base: u64,
+    /// GPU ID configuration read from hardware.
+    pub(crate) id: GpuIdConfig,
+    /// Power calibration configuration for this specific chip/device.
+    pub(crate) pwr: PwrConfig,
+}
+
+/// Specific GPU ID configuration fetched from SGX MMIO registers.
+#[derive(Debug)]
+pub(crate) struct GpuIdConfig {
+    /// GPU generation (should match static config).
+    pub(crate) gpu_gen: GpuGen,
+    /// GPU variant type (should match static config).
+    pub(crate) gpu_variant: GpuVariant,
+    /// GPU silicon revision.
+    pub(crate) gpu_rev: GpuRevision,
+    /// GPU silicon revision ID (firmware enum).
+    pub(crate) gpu_rev_id: GpuRevisionID,
+    /// Maximum number of dies supported.
+    pub(crate) max_dies: u32,
+    /// Total number of GPU clusters.
+    pub(crate) num_clusters: u32,
+    /// Maximum number of GPU cores per cluster.
+    pub(crate) num_cores: u32,
+    /// Number of frags per cluster.
+    pub(crate) num_frags: u32,
+    /// Number of GPs per cluster.
+    pub(crate) num_gps: u32,
+    /// Total number of active cores for the whole GPU.
+    pub(crate) total_active_cores: u32,
+    /// Mask of active cores per cluster.
+    pub(crate) core_masks: Vec<u32>,
+    /// Packed mask of all active cores.
+    pub(crate) core_masks_packed: Vec<u32>,
+}
+
+/// Configurable GPU power settings from the device tree.
+#[derive(Debug)]
+pub(crate) struct PwrConfig {
+    /// GPU performance state list.
+    pub(crate) perf_states: Vec<PState>,
+    /// GPU power zone list.
+    pub(crate) power_zones: Vec<PowerZone>,
+
+    /// Core leakage coefficient per cluster.
+    pub(crate) core_leak_coef: Vec<F32>,
+    /// SRAM leakage coefficient per cluster.
+    pub(crate) sram_leak_coef: Vec<F32>,
+
+    /// Maximum total power of the GPU in milliwatts.
+    pub(crate) max_power_mw: u32,
+    /// Maximum frequency of the GPU in megahertz.
+    pub(crate) max_freq_mhz: u32,
+
+    /// Minimum performance state to start at.
+    pub(crate) perf_base_pstate: u32,
+    /// Maximum enabled performance state.
+    pub(crate) perf_max_pstate: u32,
+
+    /// Minimum voltage for the SRAM power domain in microvolts.
+    pub(crate) min_sram_microvolt: u32,
+
+    // Most of these fields are just named after Apple ADT property names and we don't fully
+    // understand them. They configure various power-related PID loops and filters.
+    /// Average power filter time constant in milliseconds.
+    pub(crate) avg_power_filter_tc_ms: u32,
+    /// Average power filter PID integral gain?
+    pub(crate) avg_power_ki_only: F32,
+    /// Average power filter PID proportional gain?
+    pub(crate) avg_power_kp: F32,
+    pub(crate) avg_power_min_duty_cycle: u32,
+    /// Average power target filter time constant in periods.
+    pub(crate) avg_power_target_filter_tc: u32,
+    /// "Fast die0" (temperature?) PID integral gain.
+    pub(crate) fast_die0_integral_gain: F32,
+    /// "Fast die0" (temperature?) PID proportional gain.
+    pub(crate) fast_die0_proportional_gain: F32,
+    pub(crate) fast_die0_prop_tgt_delta: u32,
+    pub(crate) fast_die0_release_temp: u32,
+    /// Delay from the fender (?) becoming idle to powerdown
+    pub(crate) fender_idle_off_delay_ms: u32,
+    /// Timeout from firmware early wake to sleep if no work was submitted (?)
+    pub(crate) fw_early_wake_timeout_ms: u32,
+    /// Delay from the GPU becoming idle to powerdown
+    pub(crate) idle_off_delay_ms: u32,
+    /// Percent?
+    pub(crate) perf_boost_ce_step: u32,
+    /// Minimum utilization before performance state is increased in %.
+    pub(crate) perf_boost_min_util: u32,
+    pub(crate) perf_filter_drop_threshold: u32,
+    /// Performance PID filter time constant? (periods?)
+    pub(crate) perf_filter_time_constant: u32,
+    /// Performance PID filter time constant 2? (periods?)
+    pub(crate) perf_filter_time_constant2: u32,
+    /// Performance PID integral gain.
+    pub(crate) perf_integral_gain: F32,
+    /// Performance PID integral gain 2 (?).
+    pub(crate) perf_integral_gain2: F32,
+    pub(crate) perf_integral_min_clamp: u32,
+    /// Performance PID proportional gain.
+    pub(crate) perf_proportional_gain: F32,
+    /// Performance PID proportional gain 2 (?).
+    pub(crate) perf_proportional_gain2: F32,
+    pub(crate) perf_reset_iters: u32,
+    /// Target GPU utilization for the performance controller in %.
+    pub(crate) perf_tgt_utilization: u32,
+    /// Power sampling period in milliseconds.
+    pub(crate) power_sample_period: u32,
+    /// PPM (?) filter time constant in milliseconds.
+    pub(crate) ppm_filter_time_constant_ms: u32,
+    /// PPM (?) filter PID integral gain.
+    pub(crate) ppm_ki: F32,
+    /// PPM (?) filter PID proportional gain.
+    pub(crate) ppm_kp: F32,
+    /// Power consumption filter time constant (periods?)
+    pub(crate) pwr_filter_time_constant: u32,
+    /// Power consumption filter PID integral gain.
+    pub(crate) pwr_integral_gain: F32,
+    pub(crate) pwr_integral_min_clamp: u32,
+    pub(crate) pwr_min_duty_cycle: u32,
+    pub(crate) pwr_proportional_gain: F32,
+}
+
+impl PwrConfig {
+    /// Load the GPU power configuration from the device tree.
+    pub(crate) fn load(dev: &AsahiDevice, cfg: &HwConfig) -> Result<PwrConfig> {
+        let mut perf_states = Vec::new();
+
+        let node = dev.of_node().ok_or(EIO)?;
+        let opps = node
+            .parse_phandle(c_str!("operating-points-v2"), 0)
+            .ok_or(EIO)?;
+
+        let mut max_power_mw: u32 = 0;
+        let mut max_freq_mhz: u32 = 0;
+
+        macro_rules! prop {
+            ($prop:expr, $default:expr) => {{
+                node.get_opt_property(c_str!($prop))
+                    .map_err(|e| {
+                        dev_err!(dev, "Error reading property {}: {:?}\n", $prop, e);
+                        e
+                    })?
+                    .unwrap_or($default)
+            }};
+            ($prop:expr) => {{
+                node.get_property(c_str!($prop)).map_err(|e| {
+                    dev_err!(dev, "Error reading property {}: {:?}\n", $prop, e);
+                    e
+                })?
+            }};
+        }
+
+        for opp in opps.children() {
+            let freq_hz: u64 = opp.get_property(c_str!("opp-hz"))?;
+            let mut volt_uv: Vec<u32> = opp.get_property(c_str!("opp-microvolt"))?;
+            let pwr_uw: u32 = opp.get_property(c_str!("opp-microwatt"))?;
+
+            if volt_uv.len() != cfg.max_num_clusters as usize {
+                dev_err!(
+                    dev,
+                    "Invalid opp-microvolt length (expected {}, got {})\n",
+                    cfg.max_num_clusters,
+                    volt_uv.len()
+                );
+                return Err(EINVAL);
+            }
+
+            volt_uv.iter_mut().for_each(|a| *a /= 1000);
+            let volt_mv = volt_uv;
+
+            let pwr_mw = pwr_uw / 1000;
+            max_power_mw = max_power_mw.max(pwr_mw);
+
+            let freq_mhz: u32 = (freq_hz / 1_000_000).try_into()?;
+            max_freq_mhz = max_freq_mhz.max(freq_mhz);
+
+            perf_states.try_push(PState {
+                freq_hz: freq_hz.try_into()?,
+                volt_mv,
+                pwr_mw,
+            })?;
+        }
+
+        let pz_data = prop!("apple,power-zones", Vec::new());
+
+        if pz_data.len() > 3 * MAX_POWERZONES || pz_data.len() % 3 != 0 {
+            dev_err!(dev, "Invalid apple,power-zones value\n");
+            return Err(EINVAL);
+        }
+
+        let pz_count = pz_data.len() / 3;
+        let mut power_zones = Vec::new();
+        for i in (0..pz_count).step_by(3) {
+            power_zones.try_push(PowerZone {
+                target: pz_data[i],
+                target_offset: pz_data[i + 1],
+                filter_tc: pz_data[i + 2],
+            })?;
+        }
+
+        let core_leak_coef: Vec<F32> = prop!("apple,core-leak-coef");
+        let sram_leak_coef: Vec<F32> = prop!("apple,sram-leak-coef");
+
+        if core_leak_coef.len() != cfg.max_num_clusters as usize {
+            dev_err!(dev, "Invalid apple,core-leak-coef\n");
+            return Err(EINVAL);
+        }
+        if sram_leak_coef.len() != cfg.max_num_clusters as usize {
+            dev_err!(dev, "Invalid apple,sram_leak_coef\n");
+            return Err(EINVAL);
+        }
+
+        Ok(PwrConfig {
+            core_leak_coef,
+            sram_leak_coef,
+
+            max_power_mw,
+            max_freq_mhz,
+
+            perf_base_pstate: prop!("apple,perf-base-pstate", 1),
+            perf_max_pstate: perf_states.len() as u32 - 1,
+            min_sram_microvolt: prop!("apple,min-sram-microvolt"),
+
+            avg_power_filter_tc_ms: prop!("apple,avg-power-filter-tc-ms"),
+            avg_power_ki_only: prop!("apple,avg-power-ki-only"),
+            avg_power_kp: prop!("apple,avg-power-kp"),
+            avg_power_min_duty_cycle: prop!("apple,avg-power-min-duty-cycle"),
+            avg_power_target_filter_tc: prop!("apple,avg-power-target-filter-tc"),
+            fast_die0_integral_gain: prop!("apple,fast-die0-integral-gain"),
+            fast_die0_proportional_gain: prop!("apple,fast-die0-proportional-gain"),
+            fast_die0_prop_tgt_delta: prop!("apple,fast-die0-prop-tgt-delta", 0),
+            fast_die0_release_temp: prop!("apple,fast-die0-release-temp", 80),
+            fender_idle_off_delay_ms: prop!("apple,fender-idle-off-delay-ms", 40),
+            fw_early_wake_timeout_ms: prop!("apple,fw-early-wake-timeout-ms", 5),
+            idle_off_delay_ms: prop!("apple,idle-off-delay-ms", 2),
+            perf_boost_ce_step: prop!("apple,perf-boost-ce-step", 25),
+            perf_boost_min_util: prop!("apple,perf-boost-min-util", 100),
+            perf_filter_drop_threshold: prop!("apple,perf-filter-drop-threshold"),
+            perf_filter_time_constant2: prop!("apple,perf-filter-time-constant2"),
+            perf_filter_time_constant: prop!("apple,perf-filter-time-constant"),
+            perf_integral_gain2: prop!("apple,perf-integral-gain2"),
+            perf_integral_gain: prop!("apple,perf-integral-gain", f32!(7.8956833)),
+            perf_integral_min_clamp: prop!("apple,perf-integral-min-clamp"),
+            perf_proportional_gain2: prop!("apple,perf-proportional-gain2"),
+            perf_proportional_gain: prop!("apple,perf-proportional-gain", f32!(14.707963)),
+            perf_reset_iters: prop!("apple,perf-reset-iters", 6),
+            perf_tgt_utilization: prop!("apple,perf-tgt-utilization"),
+            power_sample_period: prop!("apple,power-sample-period"),
+            ppm_filter_time_constant_ms: prop!("apple,ppm-filter-time-constant-ms"),
+            ppm_ki: prop!("apple,ppm-ki"),
+            ppm_kp: prop!("apple,ppm-kp"),
+            pwr_filter_time_constant: prop!("apple,pwr-filter-time-constant", 313),
+            pwr_integral_gain: prop!("apple,pwr-integral-gain", f32!(0.0202129)),
+            pwr_integral_min_clamp: prop!("apple,pwr-integral-min-clamp", 0),
+            pwr_min_duty_cycle: prop!("apple,pwr-min-duty-cycle"),
+            pwr_proportional_gain: prop!("apple,pwr-proportional-gain", f32!(5.2831855)),
+
+            perf_states,
+            power_zones,
+        })
+    }
+
+    pub(crate) fn min_frequency_khz(&self) -> u32 {
+        self.perf_states[self.perf_base_pstate as usize].freq_hz / 1000
+    }
+
+    pub(crate) fn max_frequency_khz(&self) -> u32 {
+        self.perf_states[self.perf_max_pstate as usize].freq_hz / 1000
+    }
+}
diff --git a/drivers/gpu/drm/asahi/hw/t600x.rs b/drivers/gpu/drm/asahi/hw/t600x.rs
new file mode 100644
index 000000000000..8a8267a7e18a
--- /dev/null
+++ b/drivers/gpu/drm/asahi/hw/t600x.rs
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+
+//! Hardware configuration for t600x (M1 Pro/Max/Ultra) platforms.
+
+use crate::f32;
+
+use super::*;
+
+const fn iomaps(mcc_count: usize, has_die1: bool) -> [Option<IOMapping>; 20] {
+    [
+        Some(IOMapping::new(0x404d00000, 0x1c000, 0x1c000, true)), // Fender
+        Some(IOMapping::new(0x20e100000, 0x4000, 0x4000, false)),  // AICTimer
+        Some(IOMapping::new(0x28e104000, 0x4000, 0x4000, true)),   // AICSWInt
+        Some(IOMapping::new(0x404000000, 0x20000, 0x20000, true)), // RGX
+        None,                                                      // UVD
+        None,                                                      // unused
+        None,                                                      // DisplayUnderrunWA
+        Some(IOMapping::new(0x28e494000, 0x1000, 0x1000, false)),  // AnalogTempSensorControllerRegs
+        None,                                                      // PMPDoorbell
+        Some(IOMapping::new(0x404d80000, 0x8000, 0x8000, true)),   // MetrologySensorRegs
+        Some(IOMapping::new(0x204d61000, 0x1000, 0x1000, true)),   // GMGIFAFRegs
+        Some(IOMapping::new(
+            0x200000000,
+            mcc_count * 0xd8000,
+            0xd6400,
+            true,
+        )), // MCache registers
+        None,                                                      // AICBankedRegisters
+        None,                                                      // PMGRScratch
+        Some(IOMapping::new(0x2643c4000, 0x1000, 0x1000, true)), // NIA Special agent idle register die 0
+        if has_die1 {
+            // NIA Special agent idle register die 1
+            Some(IOMapping::new(0x22643c4000, 0x1000, 0x1000, true))
+        } else {
+            None
+        },
+        None,                                                     // CRE registers
+        None,                                                     // Streaming codec registers
+        Some(IOMapping::new(0x28e3d0000, 0x1000, 0x1000, true)),  // ?
+        Some(IOMapping::new(0x28e3c0000, 0x1000, 0x1000, false)), // ?
+    ]
+}
+
+pub(crate) const HWCONFIG_T6002: super::HwConfig = HwConfig {
+    chip_id: 0x6002,
+    gpu_gen: GpuGen::G13,
+    gpu_variant: GpuVariant::D,
+    gpu_core: GpuCore::G13C,
+    gpu_feat_compat: 0,
+    gpu_feat_incompat: feat::incompat::MANDATORY_ZS_COMPRESSION,
+
+    base_clock_hz: 24_000_000,
+    uat_oas: 42,
+    max_num_clusters: 8,
+    max_num_cores: 8,
+    max_num_frags: 8,
+    max_num_gps: 4,
+
+    preempt1_size: 0x540,
+    preempt2_size: 0x280,
+    preempt3_size: 0x20,
+
+    render: HwRenderConfig {
+        tiling_control: 0xa540,
+    },
+
+    da: HwConfigA {
+        unk_87c: 900,
+        unk_8cc: 11000,
+        unk_e24: 125,
+    },
+    db: HwConfigB {
+        unk_4e0: 4,
+        unk_534: 1,
+        unk_ab8: 0x2084,
+        unk_abc: 0x80,
+        unk_b30: 0,
+    },
+    shared1_tab: &[
+        0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+        0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    ],
+    shared1_a4: 0xffff,
+    shared2_tab: &[-1, -1, -1, -1, 0x2aa, 0xaaa, -1, -1, 0, 0],
+    shared2_unk_508: 0xcc00001,
+    sram_k: f32!(1.02),
+    unk_coef_a: &[
+        &f32!([9.838]),
+        &f32!([9.819]),
+        &f32!([9.826]),
+        &f32!([9.799]),
+        &f32!([9.799]),
+        &f32!([9.826]),
+        &f32!([9.819]),
+        &f32!([9.838]),
+    ],
+    unk_coef_b: &[
+        &f32!([13.0]),
+        &f32!([13.0]),
+        &f32!([13.0]),
+        &f32!([13.0]),
+        &f32!([13.0]),
+        &f32!([13.0]),
+        &f32!([13.0]),
+        &f32!([13.0]),
+    ],
+    global_tab: Some(&[
+        0, 1, 2, 1, 1, 90, 75, 1, 1, 1, 2, 90, 75, 1, 1, 1, 1, 90, 75, 1, 1,
+    ]),
+    fast_die0_sensor_mask: 0x8080808080808080,
+    fast_die0_sensor_mask_alt: 0x9090909090909090,
+    fast_die0_sensor_present: 0xff,
+    io_mappings: &iomaps(16, true),
+};
+
+pub(crate) const HWCONFIG_T6001: super::HwConfig = HwConfig {
+    chip_id: 0x6001,
+    gpu_variant: GpuVariant::C,
+    gpu_core: GpuCore::G13C,
+
+    max_num_clusters: 4,
+    fast_die0_sensor_mask: 0x80808080,
+    fast_die0_sensor_mask_alt: 0x90909090,
+    fast_die0_sensor_present: 0x0f,
+    io_mappings: &iomaps(8, false),
+    ..HWCONFIG_T6002
+};
+
+pub(crate) const HWCONFIG_T6000: super::HwConfig = HwConfig {
+    chip_id: 0x6000,
+    gpu_variant: GpuVariant::S,
+    gpu_core: GpuCore::G13S,
+
+    max_num_clusters: 2,
+    fast_die0_sensor_mask: 0x8080,
+    fast_die0_sensor_mask_alt: 0x9090,
+    fast_die0_sensor_present: 0x03,
+    io_mappings: &iomaps(4, false),
+    ..HWCONFIG_T6001
+};
diff --git a/drivers/gpu/drm/asahi/hw/t8103.rs b/drivers/gpu/drm/asahi/hw/t8103.rs
new file mode 100644
index 000000000000..3d38b088a0f5
--- /dev/null
+++ b/drivers/gpu/drm/asahi/hw/t8103.rs
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+
+//! Hardware configuration for t8103 platforms (M1).
+
+use crate::f32;
+
+use super::*;
+
+pub(crate) const HWCONFIG: super::HwConfig = HwConfig {
+    chip_id: 0x8103,
+    gpu_gen: GpuGen::G13,
+    gpu_variant: GpuVariant::G,
+    gpu_core: GpuCore::G13G,
+    gpu_feat_compat: 0,
+    gpu_feat_incompat: 0,
+
+    base_clock_hz: 24_000_000,
+    uat_oas: 40,
+    max_num_clusters: 1,
+    max_num_cores: 8,
+    max_num_frags: 8,
+    max_num_gps: 4,
+
+    preempt1_size: 0x540,
+    preempt2_size: 0x280,
+    preempt3_size: 0x20,
+
+    render: HwRenderConfig {
+        // bit 0: disable clustering (always)
+        tiling_control: 0xa041,
+    },
+
+    da: HwConfigA {
+        unk_87c: -220,
+        unk_8cc: 9880,
+        unk_e24: 112,
+    },
+    db: HwConfigB {
+        unk_4e0: 0,
+        unk_534: 0,
+        unk_ab8: 0x48,
+        unk_abc: 0x8,
+        unk_b30: 0,
+    },
+    shared1_tab: &[
+        -1, 0x7282, 0x50ea, 0x370a, 0x25be, 0x1c1f, 0x16fb, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    ],
+    shared1_a4: 0xffff,
+    shared2_tab: &[0x800, 0x1555, -1, -1, -1, -1, -1, -1, 0, 0],
+    shared2_unk_508: 0xc00007,
+    sram_k: f32!(1.02),
+    unk_coef_a: &[],
+    unk_coef_b: &[],
+    global_tab: None,
+    fast_die0_sensor_mask: 0x12,
+    fast_die0_sensor_mask_alt: 0x12,
+    fast_die0_sensor_present: 0x01,
+    io_mappings: &[
+        Some(IOMapping::new(0x204d00000, 0x1c000, 0x1c000, true)), // Fender
+        Some(IOMapping::new(0x20e100000, 0x4000, 0x4000, false)),  // AICTimer
+        Some(IOMapping::new(0x23b104000, 0x4000, 0x4000, true)),   // AICSWInt
+        Some(IOMapping::new(0x204000000, 0x20000, 0x20000, true)), // RGX
+        None,                                                      // UVD
+        None,                                                      // unused
+        None,                                                      // DisplayUnderrunWA
+        Some(IOMapping::new(0x23b2e8000, 0x1000, 0x1000, false)),  // AnalogTempSensorControllerRegs
+        Some(IOMapping::new(0x23bc00000, 0x1000, 0x1000, true)),   // PMPDoorbell
+        Some(IOMapping::new(0x204d80000, 0x5000, 0x5000, true)),   // MetrologySensorRegs
+        Some(IOMapping::new(0x204d61000, 0x1000, 0x1000, true)),   // GMGIFAFRegs
+        Some(IOMapping::new(0x200000000, 0xd6400, 0xd6400, true)), // MCache registers
+        None,                                                      // AICBankedRegisters
+        Some(IOMapping::new(0x23b738000, 0x1000, 0x1000, true)),   // PMGRScratch
+        None, // NIA Special agent idle register die 0
+        None, // NIA Special agent idle register die 1
+        None, // CRE registers
+        None, // Streaming codec registers
+        None, //
+        None, //
+    ],
+};
diff --git a/drivers/gpu/drm/asahi/hw/t8112.rs b/drivers/gpu/drm/asahi/hw/t8112.rs
new file mode 100644
index 000000000000..5624dca130be
--- /dev/null
+++ b/drivers/gpu/drm/asahi/hw/t8112.rs
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+
+//! Hardware configuration for t8112 platforms (M2).
+
+use crate::f32;
+
+use super::*;
+
+pub(crate) const HWCONFIG: super::HwConfig = HwConfig {
+    chip_id: 0x8112,
+    gpu_gen: GpuGen::G14,
+    gpu_variant: GpuVariant::G,
+    gpu_core: GpuCore::G14G,
+    gpu_feat_compat: 0,
+    gpu_feat_incompat: 0,
+
+    base_clock_hz: 24_000_000,
+    uat_oas: 40,
+    max_num_clusters: 1,
+    max_num_cores: 10,
+    max_num_frags: 10,
+    max_num_gps: 4,
+
+    preempt1_size: 0x540,
+    preempt2_size: 0x280,
+    preempt3_size: 0x20,
+
+    render: HwRenderConfig {
+        // TODO: this is unused here, may be present in newer FW
+        tiling_control: 0xa041,
+    },
+
+    da: HwConfigA {
+        unk_87c: 900,
+        unk_8cc: 11000,
+        unk_e24: 125,
+    },
+    db: HwConfigB {
+        unk_4e0: 4,
+        unk_534: 0,
+        unk_ab8: 0x2048,
+        unk_abc: 0x4000,
+        unk_b30: 1,
+    },
+    shared1_tab: &[
+        0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+        0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+    ],
+    shared1_a4: 0,
+    shared2_tab: &[-1, -1, -1, -1, -1, -1, -1, -1, 0xaa5aa, 0],
+    shared2_unk_508: 0xc00000,
+    sram_k: f32!(1.02),
+    // 13.2: last coef changed from 6.6 to 5.3, assuming that was a fix we can backport
+    unk_coef_a: &[&f32!([0.0, 0.0, 0.0, 0.0, 5.3, 0.0, 5.3, /*6.6*/ 5.3])],
+    unk_coef_b: &[&f32!([0.0, 0.0, 0.0, 0.0, 5.3, 0.0, 5.3, /*6.6*/ 5.3])],
+    global_tab: None,
+    fast_die0_sensor_mask: 0x6800,
+    fast_die0_sensor_mask_alt: 0x6800,
+    fast_die0_sensor_present: 0x02,
+    io_mappings: &[
+        Some(IOMapping::new(0x204d00000, 0x14000, 0x14000, true)), // Fender
+        Some(IOMapping::new(0x20e100000, 0x4000, 0x4000, false)),  // AICTimer
+        Some(IOMapping::new(0x23b0c4000, 0x4000, 0x4000, true)),   // AICSWInt
+        Some(IOMapping::new(0x204000000, 0x20000, 0x20000, true)), // RGX
+        None,                                                      // UVD
+        None,                                                      // unused
+        None,                                                      // DisplayUnderrunWA
+        Some(IOMapping::new(0x23b2c0000, 0x1000, 0x1000, false)),  // AnalogTempSensorControllerRegs
+        None,                                                      // PMPDoorbell
+        Some(IOMapping::new(0x204d80000, 0x8000, 0x8000, true)),   // MetrologySensorRegs
+        Some(IOMapping::new(0x204d61000, 0x1000, 0x1000, true)),   // GMGIFAFRegs
+        Some(IOMapping::new(0x200000000, 0xd6400, 0xd6400, true)), // MCache registers
+        None,                                                      // AICBankedRegisters
+        None,                                                      // PMGRScratch
+        None, // NIA Special agent idle register die 0
+        None, // NIA Special agent idle register die 1
+        Some(IOMapping::new(0x204e00000, 0x10000, 0x10000, true)), // CRE registers
+        Some(IOMapping::new(0x27d050000, 0x4000, 0x4000, true)), // Streaming codec registers
+        Some(IOMapping::new(0x23b3d0000, 0x1000, 0x1000, true)), //
+        Some(IOMapping::new(0x23b3c0000, 0x1000, 0x1000, true)), //
+    ],
+};
diff --git a/drivers/gpu/drm/asahi/initdata.rs b/drivers/gpu/drm/asahi/initdata.rs
new file mode 100644
index 000000000000..472c42169130
--- /dev/null
+++ b/drivers/gpu/drm/asahi/initdata.rs
@@ -0,0 +1,777 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+#![allow(clippy::unusual_byte_groupings)]
+
+//! GPU initialization data builder.
+//!
+//! The root of all interaction between the GPU firmware and the host driver is a complex set of
+//! nested structures that we call InitData. This includes both GPU hardware/firmware configuration
+//! and the pointers to the ring buffers and global data fields that are used for communication at
+//! runtime.
+//!
+//! Many of these structures are poorly understood, so there are lots of hardcoded unknown values
+//! derived from observing the InitData structures that macOS generates.
+
+use crate::fw::initdata::*;
+use crate::fw::types::*;
+use crate::{box_in_place, f32, place};
+use crate::{gpu, hw, mmu};
+use kernel::error::Result;
+use kernel::macros::versions;
+
+/// Builder helper for the global GPU InitData.
+#[versions(AGX)]
+pub(crate) struct InitDataBuilder<'a> {
+    alloc: &'a mut gpu::KernelAllocators,
+    cfg: &'a hw::HwConfig,
+    dyncfg: &'a hw::DynConfig,
+}
+
+#[versions(AGX)]
+impl<'a> InitDataBuilder::ver<'a> {
+    /// Create a new InitData builder
+    pub(crate) fn new(
+        alloc: &'a mut gpu::KernelAllocators,
+        cfg: &'a hw::HwConfig,
+        dyncfg: &'a hw::DynConfig,
+    ) -> InitDataBuilder::ver<'a> {
+        InitDataBuilder::ver { alloc, cfg, dyncfg }
+    }
+
+    /// Create the HwDataShared1 structure, which is used in two places in InitData.
+    #[inline(never)]
+    fn hw_shared1(cfg: &hw::HwConfig) -> raw::HwDataShared1 {
+        let mut ret = raw::HwDataShared1 {
+            unk_a4: cfg.shared1_a4,
+            ..Default::default()
+        };
+        for (i, val) in cfg.shared1_tab.iter().enumerate() {
+            ret.table[i] = *val;
+        }
+        ret
+    }
+
+    fn init_curve(
+        curve: &mut raw::HwDataShared2Curve,
+        unk_0: u32,
+        unk_4: u32,
+        t1: &[i16],
+        t2: &[i16],
+        t3: &[&[i32]],
+    ) {
+        curve.unk_0 = unk_0;
+        curve.unk_4 = unk_4;
+        (*curve.t1)[..t1.len()].copy_from_slice(t1);
+        (*curve.t1)[t1.len()..].fill(t1[0]);
+        (*curve.t2)[..t2.len()].copy_from_slice(t2);
+        (*curve.t2)[t2.len()..].fill(t2[0]);
+        for (i, a) in curve.t3.iter_mut().enumerate() {
+            a.fill(0x3ffffff);
+            if i < t3.len() {
+                let b = t3[i];
+                (**a)[..b.len()].copy_from_slice(b);
+            }
+        }
+    }
+
+    /// Create the HwDataShared2 structure, which is used in two places in InitData.
+    #[inline(never)]
+    fn hw_shared2(cfg: &hw::HwConfig) -> Result<Box<raw::HwDataShared2>> {
+        let mut ret = box_in_place!(raw::HwDataShared2 {
+            unk_28: Array::new([0xff; 16]),
+            t8112: Default::default(),
+            unk_508: cfg.shared2_unk_508,
+            ..Default::default()
+        })?;
+
+        for (i, val) in cfg.shared2_tab.iter().enumerate() {
+            ret.table[i] = *val;
+        }
+
+        if cfg.chip_id == 0x8112 {
+            ret.t8112.unk_14 = 0x6000000;
+            Self::init_curve(&mut ret.t8112.curve1, 0, 0x20000000, &[-1], &[0x0f07], &[]);
+            Self::init_curve(
+                &mut ret.t8112.curve2,
+                7,
+                0x80000000,
+                &[-1, 25740, 17429, 12550, 9597, 7910, 6657, 5881, 5421],
+                &[
+                    0x0f07, 0x04c0, 0x06c0, 0x08c0, 0x0ac0, 0x0c40, 0x0dc0, 0x0ec0, 0x0f80,
+                ],
+                &[
+                    &[0x3ffffff, 107, 101, 94, 87, 82, 77, 73, 71],
+                    &[
+                        0x3ffffff, 38240, 36251, 33562, 31368, 29379, 27693, 26211, 25370,
+                    ],
+                    &[
+                        0x3ffffff, 123933, 117485, 108771, 101661, 95217, 89751, 84948, 82222,
+                    ],
+                ],
+            );
+        }
+
+        Ok(ret)
+    }
+
+    /// Create the HwDataShared3 structure, which is used in two places in InitData.
+    #[inline(never)]
+    fn hw_shared3(cfg: &hw::HwConfig) -> Result<Box<raw::HwDataShared3>> {
+        let mut ret = box_in_place!(raw::HwDataShared3 {
+            ..Default::default()
+        })?;
+
+        if cfg.chip_id == 0x8112 {
+            ret.unk_0 = 1;
+            ret.unk_4 = 500;
+            ret.unk_8 = 5;
+            ret.table.copy_from_slice(&[
+                10700, 10700, 10700, 10700, 10700, 6000, 1000, 1000, 1000, 10700, 10700, 10700,
+                10700, 10700, 10700, 10700,
+            ]);
+            ret.unk_4c = 1;
+        }
+
+        Ok(ret)
+    }
+
+    /// Create an unknown T81xx-specific data structure.
+    fn t81xx_data(dyncfg: &'a hw::DynConfig) -> raw::T81xxData {
+        raw::T81xxData {
+            unk_d8c: 0x80000000,
+            unk_d90: 4,
+            unk_d9c: f32!(0.6),
+            unk_da4: f32!(0.4),
+            unk_dac: f32!(0.38552),
+            unk_db8: f32!(65536.0),
+            unk_dbc: f32!(13.56),
+            max_pstate_scaled: 100 * dyncfg.pwr.perf_max_pstate,
+            ..Default::default()
+        }
+    }
+
+    /// Create the HwDataA structure. This mostly contains power-related configuration.
+    #[inline(never)]
+    fn hwdata_a(&mut self) -> Result<GpuObject<HwDataA::ver>> {
+        self.alloc
+            .private
+            .new_inplace(Default::default(), |_inner, ptr| {
+                let pwr = &self.dyncfg.pwr;
+                let period_ms = pwr.power_sample_period;
+                let period_s = F32::from(period_ms) / f32!(1000.0);
+                let ppm_filter_tc_periods = pwr.ppm_filter_time_constant_ms / period_ms;
+                #[ver(V >= V13_0B4)]
+                let ppm_filter_tc_ms_rounded = ppm_filter_tc_periods * period_ms;
+                let ppm_filter_a = f32!(1.0) / ppm_filter_tc_periods.into();
+                let perf_filter_a = f32!(1.0) / pwr.perf_filter_time_constant.into();
+                let perf_filter_a2 = f32!(1.0) / pwr.perf_filter_time_constant2.into();
+                let avg_power_target_filter_a = f32!(1.0) / pwr.avg_power_target_filter_tc.into();
+                let avg_power_filter_tc_periods = pwr.avg_power_filter_tc_ms / period_ms;
+                #[ver(V >= V13_0B4)]
+                let avg_power_filter_tc_ms_rounded = avg_power_filter_tc_periods * period_ms;
+                let avg_power_filter_a = f32!(1.0) / avg_power_filter_tc_periods.into();
+                let pwr_filter_a = f32!(1.0) / pwr.pwr_filter_time_constant.into();
+
+                let base_ps = pwr.perf_base_pstate;
+                let base_ps_scaled = 100 * base_ps;
+                let max_ps = pwr.perf_max_pstate;
+                let max_ps_scaled = 100 * max_ps;
+                let boost_ps_count = max_ps - base_ps;
+
+                let base_clock_khz = self.cfg.base_clock_hz / 1000;
+                let clocks_per_period = base_clock_khz * period_ms;
+
+                let raw = place!(
+                    ptr,
+                    raw::HwDataA::ver {
+                        clocks_per_period: clocks_per_period,
+                        #[ver(V >= V13_0B4)]
+                        clocks_per_period_2: clocks_per_period,
+                        pwr_status: AtomicU32::new(4),
+                        unk_10: f32!(1.0),
+                        actual_pstate: 1,
+                        tgt_pstate: 1,
+                        base_pstate_scaled: base_ps_scaled,
+                        unk_40: 1,
+                        max_pstate_scaled: max_ps_scaled,
+                        min_pstate_scaled: 100,
+                        unk_64c: 625,
+                        pwr_filter_a_neg: f32!(1.0) - pwr_filter_a,
+                        pwr_filter_a: pwr_filter_a,
+                        pwr_integral_gain: pwr.pwr_integral_gain,
+                        pwr_integral_min_clamp: pwr.pwr_integral_min_clamp.into(),
+                        max_power_1: pwr.max_power_mw.into(),
+                        pwr_proportional_gain: pwr.pwr_proportional_gain,
+                        pwr_pstate_related_k: -F32::from(max_ps_scaled) / pwr.max_power_mw.into(),
+                        pwr_pstate_max_dc_offset: pwr.pwr_min_duty_cycle as i32
+                            - max_ps_scaled as i32,
+                        max_pstate_scaled_2: max_ps_scaled,
+                        max_power_2: pwr.max_power_mw,
+                        max_pstate_scaled_3: max_ps_scaled,
+                        ppm_filter_tc_periods_x4: ppm_filter_tc_periods * 4,
+                        ppm_filter_a_neg: f32!(1.0) - ppm_filter_a,
+                        ppm_filter_a: ppm_filter_a,
+                        ppm_ki_dt: pwr.ppm_ki * period_s,
+                        unk_6fc: f32!(65536.0),
+                        ppm_kp: pwr.ppm_kp,
+                        pwr_min_duty_cycle: pwr.pwr_min_duty_cycle,
+                        max_pstate_scaled_4: max_ps_scaled,
+                        unk_71c: f32!(0.0),
+                        max_power_3: pwr.max_power_mw,
+                        cur_power_mw_2: 0x0,
+                        ppm_filter_tc_ms: pwr.ppm_filter_time_constant_ms,
+                        #[ver(V >= V13_0B4)]
+                        ppm_filter_tc_clks: ppm_filter_tc_ms_rounded * base_clock_khz,
+                        perf_tgt_utilization: pwr.perf_tgt_utilization,
+                        perf_boost_min_util: pwr.perf_boost_min_util,
+                        perf_boost_ce_step: pwr.perf_boost_ce_step,
+                        perf_reset_iters: pwr.perf_reset_iters,
+                        unk_774: 6,
+                        unk_778: 1,
+                        perf_filter_drop_threshold: pwr.perf_filter_drop_threshold,
+                        perf_filter_a_neg: f32!(1.0) - perf_filter_a,
+                        perf_filter_a2_neg: f32!(1.0) - perf_filter_a2,
+                        perf_filter_a: perf_filter_a,
+                        perf_filter_a2: perf_filter_a2,
+                        perf_ki: pwr.perf_integral_gain,
+                        perf_ki2: pwr.perf_integral_gain2,
+                        perf_integral_min_clamp: pwr.perf_integral_min_clamp.into(),
+                        unk_79c: f32!(95.0),
+                        perf_kp: pwr.perf_proportional_gain,
+                        perf_kp2: pwr.perf_proportional_gain2,
+                        boost_state_unk_k: F32::from(boost_ps_count) / f32!(0.95),
+                        base_pstate_scaled_2: base_ps_scaled,
+                        max_pstate_scaled_5: max_ps_scaled,
+                        base_pstate_scaled_3: base_ps_scaled,
+                        perf_tgt_utilization_2: pwr.perf_tgt_utilization,
+                        base_pstate_scaled_4: base_ps_scaled,
+                        unk_7fc: f32!(65536.0),
+                        pwr_min_duty_cycle_2: pwr.pwr_min_duty_cycle.into(),
+                        max_pstate_scaled_6: max_ps_scaled.into(),
+                        max_freq_mhz: pwr.max_freq_mhz,
+                        pwr_min_duty_cycle_3: pwr.pwr_min_duty_cycle,
+                        min_pstate_scaled_4: f32!(100.0),
+                        max_pstate_scaled_7: max_ps_scaled,
+                        unk_alpha_neg: f32!(0.8),
+                        unk_alpha: f32!(0.2),
+                        fast_die0_sensor_mask: U64(self.cfg.fast_die0_sensor_mask),
+                        fast_die0_release_temp_cc: 100 * pwr.fast_die0_release_temp,
+                        unk_87c: self.cfg.da.unk_87c,
+                        unk_880: 0x4,
+                        unk_894: f32!(1.0),
+
+                        fast_die0_ki_dt: pwr.fast_die0_integral_gain * period_s,
+                        unk_8a8: f32!(65536.0),
+                        fast_die0_kp: pwr.fast_die0_proportional_gain,
+                        pwr_min_duty_cycle_4: pwr.pwr_min_duty_cycle,
+                        max_pstate_scaled_8: max_ps_scaled,
+                        max_pstate_scaled_9: max_ps_scaled,
+                        fast_die0_prop_tgt_delta: 100 * pwr.fast_die0_prop_tgt_delta,
+                        unk_8cc: self.cfg.da.unk_8cc,
+                        max_pstate_scaled_10: max_ps_scaled,
+                        max_pstate_scaled_11: max_ps_scaled,
+                        unk_c2c: 1,
+                        power_zone_count: pwr.power_zones.len() as u32,
+                        max_power_4: pwr.max_power_mw,
+                        max_power_5: pwr.max_power_mw,
+                        max_power_6: pwr.max_power_mw,
+                        avg_power_target_filter_a_neg: f32!(1.0) - avg_power_target_filter_a,
+                        avg_power_target_filter_a: avg_power_target_filter_a,
+                        avg_power_target_filter_tc_x4: 4 * pwr.avg_power_target_filter_tc,
+                        avg_power_target_filter_tc_xperiod: period_ms
+                            * pwr.avg_power_target_filter_tc,
+                        #[ver(V >= V13_0B4)]
+                        avg_power_target_filter_tc_clks: period_ms
+                            * pwr.avg_power_target_filter_tc
+                            * base_clock_khz,
+                        avg_power_filter_tc_periods_x4: 4 * avg_power_filter_tc_periods,
+                        avg_power_filter_a_neg: f32!(1.0) - avg_power_filter_a,
+                        avg_power_filter_a: avg_power_filter_a,
+                        avg_power_ki_dt: pwr.avg_power_ki_only * period_s,
+                        unk_d20: f32!(65536.0),
+                        avg_power_kp: pwr.avg_power_kp,
+                        avg_power_min_duty_cycle: pwr.avg_power_min_duty_cycle,
+                        max_pstate_scaled_12: max_ps_scaled,
+                        max_pstate_scaled_13: max_ps_scaled,
+                        max_power_7: pwr.max_power_mw.into(),
+                        max_power_8: pwr.max_power_mw,
+                        avg_power_filter_tc_ms: pwr.avg_power_filter_tc_ms,
+                        #[ver(V >= V13_0B4)]
+                        avg_power_filter_tc_clks: avg_power_filter_tc_ms_rounded * base_clock_khz,
+                        max_pstate_scaled_14: max_ps_scaled,
+                        t81xx_data: match self.cfg.chip_id {
+                            0x8103 | 0x8112 => Self::t81xx_data(self.dyncfg),
+                            _ => Default::default(),
+                        },
+                        #[ver(V >= V13_0B4)]
+                        unk_e10_0: raw::HwDataA130Extra {
+                            unk_38: 4,
+                            unk_3c: 8000,
+                            unk_40: 2500,
+                            unk_48: 0xffffffff,
+                            unk_4c: 50,
+                            unk_54: 50,
+                            unk_58: 0x1,
+                            unk_60: f32!(0.8888889),
+                            unk_64: f32!(0.6666667),
+                            unk_68: f32!(0.11111111),
+                            unk_6c: f32!(0.33333333),
+                            unk_70: f32!(-0.4),
+                            unk_74: f32!(-0.8),
+                            unk_7c: f32!(65536.0),
+                            unk_80: f32!(-5.0),
+                            unk_84: f32!(-10.0),
+                            unk_8c: 40,
+                            max_pstate_scaled_1: max_ps_scaled,
+                            unk_9c: f32!(8000.0),
+                            unk_a0: 1400,
+                            unk_a8: 72,
+                            unk_ac: 24,
+                            unk_b0: 1728000,
+                            unk_b8: 576000,
+                            unk_c4: f32!(65536.0),
+                            unk_114: f32!(65536.0),
+                            unk_124: 40,
+                            max_pstate_scaled_2: max_ps_scaled,
+                            ..Default::default()
+                        },
+                        fast_die0_sensor_mask_2: U64(self.cfg.fast_die0_sensor_mask),
+                        unk_e24: self.cfg.da.unk_e24,
+                        unk_e28: 1,
+                        fast_die0_sensor_mask_alt: U64(self.cfg.fast_die0_sensor_mask_alt),
+                        #[ver(V < V13_0B4)]
+                        fast_die0_sensor_present: U64(self.cfg.fast_die0_sensor_present as u64),
+                        unk_163c: 1,
+                        unk_3644: 0,
+                        hws1: Self::hw_shared1(self.cfg),
+                        hws2: *Self::hw_shared2(self.cfg)?,
+                        hws3: *Self::hw_shared3(self.cfg)?,
+                        unk_3ce8: 1,
+                        ..Default::default()
+                    }
+                );
+
+                for i in 0..self.dyncfg.pwr.perf_states.len() {
+                    raw.sram_k[i] = self.cfg.sram_k;
+                }
+
+                for (i, coef) in pwr.core_leak_coef.iter().enumerate() {
+                    raw.core_leak_coef[i] = *coef;
+                }
+
+                for (i, coef) in pwr.sram_leak_coef.iter().enumerate() {
+                    raw.sram_leak_coef[i] = *coef;
+                }
+
+                for i in 0..self.dyncfg.id.num_clusters as usize {
+                    if let Some(coef_a) = self.cfg.unk_coef_a.get(i) {
+                        (*raw.unk_coef_a1[i])[..coef_a.len()].copy_from_slice(coef_a);
+                        (*raw.unk_coef_a2[i])[..coef_a.len()].copy_from_slice(coef_a);
+                    }
+                    if let Some(coef_b) = self.cfg.unk_coef_b.get(i) {
+                        (*raw.unk_coef_b1[i])[..coef_b.len()].copy_from_slice(coef_b);
+                        (*raw.unk_coef_b2[i])[..coef_b.len()].copy_from_slice(coef_b);
+                    }
+                }
+
+                for (i, pz) in pwr.power_zones.iter().enumerate() {
+                    raw.power_zones[i].target = pz.target;
+                    raw.power_zones[i].target_off = pz.target - pz.target_offset;
+                    raw.power_zones[i].filter_tc_x4 = 4 * pz.filter_tc;
+                    raw.power_zones[i].filter_tc_xperiod = period_ms * pz.filter_tc;
+                    let filter_a = f32!(1.0) / pz.filter_tc.into();
+                    raw.power_zones[i].filter_a = filter_a;
+                    raw.power_zones[i].filter_a_neg = f32!(1.0) - filter_a;
+                    #[ver(V >= V13_0B4)]
+                    raw.power_zones[i].unk_10 = 1320000000;
+                }
+
+                Ok(raw)
+            })
+    }
+
+    /// Create the HwDataB structure. This mostly contains GPU-related configuration.
+    #[inline(never)]
+    fn hwdata_b(&mut self) -> Result<GpuObject<HwDataB::ver>> {
+        self.alloc
+            .private
+            .new_inplace(Default::default(), |_inner, ptr| {
+                let raw = place!(
+                    ptr,
+                    raw::HwDataB::ver {
+                        // Userspace VA map related
+                        #[ver(V < V13_0B4)]
+                        unk_0: U64(0x13_00000000),
+                        unk_8: U64(0x14_00000000),
+                        #[ver(V < V13_0B4)]
+                        unk_10: U64(0x1_00000000),
+                        unk_18: U64(0xffc00000),
+                        unk_20: U64(0x11_00000000),
+                        unk_28: U64(0x11_00000000),
+                        // userspace address?
+                        unk_30: U64(0x6f_ffff8000),
+                        // unmapped?
+                        unkptr_38: U64(0xffffffa0_11800000),
+                        // TODO: yuv matrices
+                        chip_id: self.cfg.chip_id,
+                        unk_454: 0x1,
+                        unk_458: 0x1,
+                        unk_460: 0x1,
+                        unk_464: 0x1,
+                        unk_468: 0x1,
+                        unk_47c: 0x1,
+                        unk_484: 0x1,
+                        unk_48c: 0x1,
+                        base_clock_khz: self.cfg.base_clock_hz / 1000,
+                        power_sample_period: self.dyncfg.pwr.power_sample_period,
+                        unk_49c: 0x1,
+                        unk_4a0: 0x1,
+                        unk_4a4: 0x1,
+                        unk_4c0: 0x1f,
+                        unk_4e0: U64(self.cfg.db.unk_4e0),
+                        unk_4f0: 0x1,
+                        unk_4f4: 0x1,
+                        unk_504: 0x31,
+                        unk_524: 0x1, // use_secure_cache_flush
+                        unk_534: self.cfg.db.unk_534,
+                        num_frags: self.dyncfg.id.num_frags * self.dyncfg.id.num_clusters,
+                        unk_554: 0x1,
+                        uat_ttb_base: U64(self.dyncfg.uat_ttb_base),
+                        gpu_core_id: self.cfg.gpu_core as u32,
+                        gpu_rev_id: self.dyncfg.id.gpu_rev_id as u32,
+                        num_cores: self.dyncfg.id.num_cores * self.dyncfg.id.num_clusters,
+                        max_pstate: self.dyncfg.pwr.perf_states.len() as u32 - 1,
+                        #[ver(V < V13_0B4)]
+                        num_pstates: self.dyncfg.pwr.perf_states.len() as u32,
+                        #[ver(V < V13_0B4)]
+                        min_sram_volt: self.dyncfg.pwr.min_sram_microvolt / 1000,
+                        #[ver(V < V13_0B4)]
+                        unk_ab8: self.cfg.db.unk_ab8,
+                        #[ver(V < V13_0B4)]
+                        unk_abc: self.cfg.db.unk_abc,
+                        #[ver(V < V13_0B4)]
+                        unk_ac0: 0x1020,
+
+                        #[ver(V >= V13_0B4)]
+                        unk_ae4: Array::new([0x0, 0x3, 0x7, 0x7]),
+                        #[ver(V < V13_0B4)]
+                        unk_ae4: Array::new([0x0, 0xf, 0x3f, 0x3f]),
+                        unk_b10: 0x1,
+                        unk_b24: 0x1,
+                        unk_b28: 0x1,
+                        unk_b2c: 0x1,
+                        unk_b30: self.cfg.db.unk_b30,
+                        #[ver(V >= V13_0B4)]
+                        unk_b38_0: 1,
+                        #[ver(V >= V13_0B4)]
+                        unk_b38_4: 1,
+                        unk_b38: Array::new([0xffffffff; 12]),
+                        #[ver(V >= V13_0B4)]
+                        unk_c3c: 0x19,
+                        ..Default::default()
+                    }
+                );
+
+                let base_ps = self.dyncfg.pwr.perf_base_pstate as usize;
+                let max_ps = self.dyncfg.pwr.perf_max_pstate as usize;
+                let base_freq = self.dyncfg.pwr.perf_states[base_ps].freq_hz;
+                let max_freq = self.dyncfg.pwr.perf_states[max_ps].freq_hz;
+
+                for (i, ps) in self.dyncfg.pwr.perf_states.iter().enumerate() {
+                    raw.frequencies[i] = ps.freq_hz / 1000000;
+                    for (j, mv) in ps.volt_mv.iter().enumerate() {
+                        let sram_mv = (*mv).max(self.dyncfg.pwr.min_sram_microvolt / 1000);
+                        raw.voltages[i][j] = *mv;
+                        raw.voltages_sram[i][j] = sram_mv;
+                    }
+                    raw.sram_k[i] = self.cfg.sram_k;
+                    raw.rel_max_powers[i] = ps.pwr_mw * 100 / self.dyncfg.pwr.max_power_mw;
+                    raw.rel_boost_freqs[i] = if i > base_ps {
+                        (ps.freq_hz - base_freq) / ((max_freq - base_freq) / 100)
+                    } else {
+                        0
+                    };
+                }
+
+                Ok(raw)
+            })
+    }
+
+    /// Create the Globals structure, which contains global firmware config including more power
+    /// configuration data and globals used to exchange state between the firmware and driver.
+    #[inline(never)]
+    fn globals(&mut self) -> Result<GpuObject<Globals::ver>> {
+        self.alloc
+            .shared
+            .new_inplace(Default::default(), |_inner, ptr| {
+                let pwr = &self.dyncfg.pwr;
+                let period_ms = pwr.power_sample_period;
+                let period_s = F32::from(period_ms) / f32!(1000.0);
+                let avg_power_filter_tc_periods = pwr.avg_power_filter_tc_ms / period_ms;
+
+                let max_ps = pwr.perf_max_pstate;
+                let max_ps_scaled = 100 * max_ps;
+
+                let raw = place!(
+                    ptr,
+                    raw::Globals::ver {
+                        //ktrace_enable: 0xffffffff,
+                        ktrace_enable: 0,
+                        #[ver(V >= V13_2)]
+                        unk_24_0: 3000,
+                        unk_24: 0,
+                        #[ver(V >= V13_0B4)]
+                        unk_28_0: 0, // debug
+                        unk_28: 1,
+                        #[ver(V >= V13_0B4)]
+                        unk_2c_0: 0,
+                        unk_2c: 1,
+                        unk_30: 0,
+                        unk_34: 120,
+                        sub: raw::GlobalsSub::ver {
+                            unk_54: 0xffff,
+                            unk_56: 40,
+                            unk_58: 0xffff,
+                            unk_5e: U32(1),
+                            unk_66: U32(1),
+                            ..Default::default()
+                        },
+                        unk_8900: 1,
+                        pending_submissions: AtomicU32::new(0),
+                        max_power: pwr.max_power_mw,
+                        max_pstate_scaled: max_ps_scaled,
+                        max_pstate_scaled_2: max_ps_scaled,
+                        max_pstate_scaled_3: max_ps_scaled,
+                        power_zone_count: pwr.power_zones.len() as u32,
+                        avg_power_filter_tc_periods: avg_power_filter_tc_periods,
+                        avg_power_ki_dt: pwr.avg_power_ki_only * period_s,
+                        avg_power_kp: pwr.avg_power_kp,
+                        avg_power_min_duty_cycle: pwr.avg_power_min_duty_cycle,
+                        avg_power_target_filter_tc: pwr.avg_power_target_filter_tc,
+                        unk_89bc: self.cfg.da.unk_8cc,
+                        fast_die0_release_temp: 100 * pwr.fast_die0_release_temp,
+                        unk_89c4: self.cfg.da.unk_87c,
+                        fast_die0_prop_tgt_delta: 100 * pwr.fast_die0_prop_tgt_delta,
+                        fast_die0_kp: pwr.fast_die0_proportional_gain,
+                        fast_die0_ki_dt: pwr.fast_die0_integral_gain * period_s,
+                        unk_89e0: 1,
+                        max_power_2: pwr.max_power_mw,
+                        ppm_kp: pwr.ppm_kp,
+                        ppm_ki_dt: pwr.ppm_ki * period_s,
+                        #[ver(V >= V13_0B4)]
+                        unk_89f4_8: 1,
+                        unk_89f4: 0,
+                        hws1: Self::hw_shared1(self.cfg),
+                        hws2: *Self::hw_shared2(self.cfg)?,
+                        hws3: *Self::hw_shared3(self.cfg)?,
+                        unk_900c: 1,
+                        #[ver(V >= V13_0B4)]
+                        unk_9010_0: 1,
+                        #[ver(V >= V13_0B4)]
+                        unk_903c: 1,
+                        #[ver(V < V13_0B4)]
+                        unk_903c: 0,
+                        fault_control: *crate::fault_control.read(),
+                        do_init: 1,
+                        unk_11020: 40,
+                        unk_11024: 10,
+                        unk_11028: 250,
+                        #[ver(V >= V13_0B4)]
+                        unk_1102c_0: 1,
+                        #[ver(V >= V13_0B4)]
+                        unk_1102c_4: 1,
+                        #[ver(V >= V13_0B4)]
+                        unk_1102c_8: 100,
+                        #[ver(V >= V13_0B4)]
+                        unk_1102c_c: 1,
+                        idle_off_delay_ms: AtomicU32::new(pwr.idle_off_delay_ms),
+                        fender_idle_off_delay_ms: pwr.fender_idle_off_delay_ms,
+                        fw_early_wake_timeout_ms: pwr.fw_early_wake_timeout_ms,
+                        unk_118e0: 40,
+                        #[ver(V >= V13_0B4)]
+                        unk_118e4_0: 50,
+                        #[ver(V >= V13_0B4)]
+                        unk_11edc: 0,
+                        #[ver(V >= V13_0B4)]
+                        unk_11efc: 0,
+                        ..Default::default()
+                    }
+                );
+
+                for (i, pz) in pwr.power_zones.iter().enumerate() {
+                    raw.power_zones[i].target = pz.target;
+                    raw.power_zones[i].target_off = pz.target - pz.target_offset;
+                    raw.power_zones[i].filter_tc = pz.filter_tc;
+                }
+
+                if let Some(tab) = self.cfg.global_tab.as_ref() {
+                    for (i, x) in tab.iter().enumerate() {
+                        raw.unk_118ec[i] = *x;
+                    }
+                    raw.unk_118e8 = 1;
+                }
+
+                Ok(raw)
+            })
+    }
+
+    /// Create the RuntimePointers structure, which contains pointers to most of the other
+    /// structures including the ring buffer channels, statistics structures, and HwDataA/HwDataB.
+    #[inline(never)]
+    fn runtime_pointers(&mut self) -> Result<GpuObject<RuntimePointers::ver>> {
+        let hwa = self.hwdata_a()?;
+        let hwb = self.hwdata_b()?;
+
+        let pointers: Box<RuntimePointers::ver> = box_in_place!(RuntimePointers::ver {
+            stats: Stats::ver {
+                vtx: self.alloc.private.new_default::<GpuGlobalStatsVtx::ver>()?,
+                frag: self.alloc.private.new_inplace(
+                    Default::default(),
+                    |_inner, ptr: &mut MaybeUninit<raw::GpuGlobalStatsFrag::ver>| {
+                        Ok(place!(
+                            ptr,
+                            raw::GpuGlobalStatsFrag::ver {
+                                stats: raw::GpuStatsFrag::ver {
+                                    cur_stamp_id: -1,
+                                    unk_118: -1,
+                                    ..Default::default()
+                                },
+                                ..Default::default()
+                            }
+                        ))
+                    },
+                )?,
+                comp: self.alloc.private.new_default::<GpuStatsComp>()?,
+            },
+
+            hwdata_a: hwa,
+            unkptr_190: self.alloc.private.array_empty(0x80)?,
+            unkptr_198: self.alloc.private.array_empty(0xc0)?,
+            hwdata_b: hwb,
+
+            unkptr_1b8: self.alloc.private.array_empty(0x1000)?,
+            unkptr_1c0: self.alloc.private.array_empty(0x300)?,
+            unkptr_1c8: self.alloc.private.array_empty(0x1000)?,
+
+            buffer_mgr_ctl: self.alloc.gpu.array_empty(127)?,
+        })?;
+
+        self.alloc.private.new_boxed(pointers, |inner, ptr| {
+            Ok(place!(
+                ptr,
+                raw::RuntimePointers::ver {
+                    pipes: Default::default(),
+                    device_control: Default::default(),
+                    event: Default::default(),
+                    fw_log: Default::default(),
+                    ktrace: Default::default(),
+                    stats: Default::default(),
+
+                    stats_vtx: inner.stats.vtx.gpu_pointer(),
+                    stats_frag: inner.stats.frag.gpu_pointer(),
+                    stats_comp: inner.stats.comp.gpu_pointer(),
+
+                    hwdata_a: inner.hwdata_a.gpu_pointer(),
+                    unkptr_190: inner.unkptr_190.gpu_pointer(),
+                    unkptr_198: inner.unkptr_198.gpu_pointer(),
+                    hwdata_b: inner.hwdata_b.gpu_pointer(),
+                    hwdata_b_2: inner.hwdata_b.gpu_pointer(),
+
+                    fwlog_buf: None,
+
+                    unkptr_1b8: inner.unkptr_1b8.gpu_pointer(),
+                    unkptr_1c0: inner.unkptr_1c0.gpu_pointer(),
+                    unkptr_1c8: inner.unkptr_1c8.gpu_pointer(),
+
+                    buffer_mgr_ctl: inner.buffer_mgr_ctl.gpu_pointer(),
+                    buffer_mgr_ctl_2: inner.buffer_mgr_ctl.gpu_pointer(),
+
+                    __pad0: Default::default(),
+                    unk_160: U64(0),
+                    unk_168: U64(0),
+                    unk_1d0: 0,
+                    unk_1d4: 0,
+                    unk_1d8: Default::default(),
+
+                    __pad1: Default::default(),
+                    gpu_scratch: raw::RuntimeScratch {
+                        unk_6b38: 0xff,
+                        ..Default::default()
+                    },
+                }
+            ))
+        })
+    }
+
+    /// Create the FwStatus structure, which is used to coordinate the firmware halt state between
+    /// the firmware and the driver.
+    #[inline(never)]
+    fn fw_status(&mut self) -> Result<GpuObject<FwStatus>> {
+        self.alloc
+            .shared
+            .new_object(Default::default(), |_inner| Default::default())
+    }
+
+    /// Create one UatLevelInfo structure, which describes one level of translation for the UAT MMU.
+    #[inline(never)]
+    fn uat_level_info(
+        cfg: &hw::HwConfig,
+        index_shift: usize,
+        num_entries: usize,
+    ) -> raw::UatLevelInfo {
+        raw::UatLevelInfo {
+            index_shift: index_shift as _,
+            unk_1: 14,
+            unk_2: 14,
+            unk_3: 8,
+            unk_4: 0x4000,
+            num_entries: num_entries as _,
+            unk_8: U64(1),
+            unk_10: U64(((1u64 << cfg.uat_oas) - 1) & !(mmu::UAT_PGMSK as u64)),
+            index_mask: U64(((num_entries - 1) << index_shift) as u64),
+        }
+    }
+
+    /// Build the top-level InitData object.
+    #[inline(never)]
+    pub(crate) fn build(&mut self) -> Result<Box<GpuObject<InitData::ver>>> {
+        let inner: Box<InitData::ver> = box_in_place!(InitData::ver {
+            unk_buf: self.alloc.shared_ro.array_empty(0x4000)?,
+            runtime_pointers: self.runtime_pointers()?,
+            globals: self.globals()?,
+            fw_status: self.fw_status()?,
+        })?;
+
+        Ok(Box::try_new(self.alloc.shared_ro.new_boxed(
+            inner,
+            |inner, ptr| {
+                Ok(place!(
+                    ptr,
+                    raw::InitData::ver {
+                        #[ver(V >= V13_0B4)]
+                        ver_info: Array::new([1, 1, 16, 1]),
+                        unk_buf: inner.unk_buf.gpu_pointer(),
+                        unk_8: 0,
+                        unk_c: 0,
+                        runtime_pointers: inner.runtime_pointers.gpu_pointer(),
+                        globals: inner.globals.gpu_pointer(),
+                        fw_status: inner.fw_status.gpu_pointer(),
+                        uat_page_size: 0x4000,
+                        uat_page_bits: 14,
+                        uat_num_levels: 3,
+                        uat_level_info: Array::new([
+                            Self::uat_level_info(self.cfg, 36, 8),
+                            Self::uat_level_info(self.cfg, 25, 2048),
+                            Self::uat_level_info(self.cfg, 14, 2048),
+                        ]),
+                        __pad0: Default::default(),
+                        host_mapped_fw_allocations: 1,
+                        unk_ac: 0,
+                        unk_b0: 0,
+                        unk_b4: 0,
+                        unk_b8: 0,
+                    }
+                ))
+            },
+        )?)?)
+    }
+}
diff --git a/drivers/gpu/drm/asahi/mem.rs b/drivers/gpu/drm/asahi/mem.rs
new file mode 100644
index 000000000000..491d4f8a4016
--- /dev/null
+++ b/drivers/gpu/drm/asahi/mem.rs
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+
+//! ARM64 low level memory operations.
+//!
+//! This GPU uses CPU-side `tlbi` outer-shareable instructions to manage its TLBs.
+//! Yes, really. Even though the VA address spaces are unrelated.
+//!
+//! Right now we pick our own ASIDs and don't coordinate with the CPU. This might result
+//! in needless TLB shootdowns on the CPU side... TODO: fix this.
+
+use core::arch::asm;
+use core::cmp::min;
+
+use crate::debug::*;
+use crate::mmu;
+
+type Asid = u8;
+
+/// Invalidate the entire GPU TLB.
+#[inline(always)]
+pub(crate) fn tlbi_all() {
+    unsafe {
+        asm!(".arch armv8.4-a", "tlbi vmalle1os",);
+    }
+}
+
+/// Invalidate all TLB entries for a given ASID.
+#[inline(always)]
+pub(crate) fn tlbi_asid(asid: Asid) {
+    if debug_enabled(DebugFlags::ConservativeTlbi) {
+        tlbi_all();
+        sync();
+        return;
+    }
+
+    unsafe {
+        asm!(
+            ".arch armv8.4-a",
+            "tlbi aside1os, {x}",
+            x = in(reg) ((asid as u64) << 48)
+        );
+    }
+}
+
+/// Invalidate a single page for a given ASID.
+#[inline(always)]
+pub(crate) fn tlbi_page(asid: Asid, va: usize) {
+    if debug_enabled(DebugFlags::ConservativeTlbi) {
+        tlbi_all();
+        sync();
+        return;
+    }
+
+    let val: u64 = ((asid as u64) << 48) | ((va as u64 >> 12) & 0xffffffffffc);
+    unsafe {
+        asm!(
+            ".arch armv8.4-a",
+            "tlbi vae1os, {x}",
+            x = in(reg) val
+        );
+    }
+}
+
+/// Invalidate a range of pages for a given ASID.
+#[inline(always)]
+pub(crate) fn tlbi_range(asid: Asid, va: usize, len: usize) {
+    if debug_enabled(DebugFlags::ConservativeTlbi) {
+        tlbi_all();
+        sync();
+        return;
+    }
+
+    if len == 0 {
+        return;
+    }
+
+    let start_pg = va >> mmu::UAT_PGBIT;
+    let end_pg = (va + len + mmu::UAT_PGMSK) >> mmu::UAT_PGBIT;
+
+    let mut val: u64 = ((asid as u64) << 48) | (2 << 46) | (start_pg as u64 & 0x1fffffffff);
+    let pages = end_pg - start_pg;
+
+    if pages == 1 {
+        tlbi_page(asid, va);
+        return;
+    }
+
+    // Page count is always in units of 2
+    let num = ((pages + 1) >> 1) as u64;
+    // base: 5 bits
+    // exp: 2 bits
+    // pages = (base + 1) << (5 * exp + 1)
+    // 0:00000 ->                     2 pages = 2 << 0
+    // 0:11111 ->                32 * 2 pages = 2 << 5
+    // 1:00000 ->            1 * 32 * 2 pages = 2 << 5
+    // 1:11111 ->           32 * 32 * 2 pages = 2 << 10
+    // 2:00000 ->       1 * 32 * 32 * 2 pages = 2 << 10
+    // 2:11111 ->      32 * 32 * 32 * 2 pages = 2 << 15
+    // 3:00000 ->  1 * 32 * 32 * 32 * 2 pages = 2 << 15
+    // 3:11111 -> 32 * 32 * 32 * 32 * 2 pages = 2 << 20
+    let exp = min(3, (64 - num.leading_zeros()) / 5);
+    let bits = 5 * exp;
+    let mut base = (num + (1 << bits) - 1) >> bits;
+
+    val |= (exp as u64) << 44;
+
+    while base > 32 {
+        unsafe {
+            asm!(
+                ".arch armv8.4-a",
+                "tlbi rvae1os, {x}",
+                x = in(reg) val | (31 << 39)
+            );
+        }
+        base -= 32;
+    }
+
+    unsafe {
+        asm!(
+            ".arch armv8.4-a",
+            "tlbi rvae1os, {x}",
+            x = in(reg) val | ((base - 1) << 39)
+        );
+    }
+}
+
+/// Issue a memory barrier (`dsb sy`).
+#[inline(always)]
+pub(crate) fn sync() {
+    unsafe {
+        asm!("dsb sy");
+    }
+}
diff --git a/drivers/gpu/drm/asahi/microseq.rs b/drivers/gpu/drm/asahi/microseq.rs
new file mode 100644
index 000000000000..dca94ebc53a1
--- /dev/null
+++ b/drivers/gpu/drm/asahi/microseq.rs
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+
+//! GPU Micro operation sequence builder
+//!
+//! As part of a single job submisssion to the GPU, the GPU firmware interprets a sequence of
+//! commands that we call a "microsequence". These are responsible for setting up the job execution,
+//! timestamping the process, waiting for completion, tearing up any resources, and signaling
+//! completion to the driver via the event stamp mechanism.
+//!
+//! Although the microsequences used by the macOS driver are usually quite uniform and simple, the
+//! firmware actually implements enough operations to make this interpreter Turing-complete (!).
+//! Most of those aren't implemented yet, since we don't need them, but they could come in handy in
+//! the future to do strange things or work around firmware bugs...
+//!
+//! This module simply implements a collection of microsequence operations that can be appended to
+//! and later concatenated into one buffer, ready for firmware execution.
+
+use crate::fw::microseq;
+pub(crate) use crate::fw::microseq::*;
+use crate::fw::types::*;
+use kernel::prelude::*;
+
+/// MicroSequence object type, which is just an opaque byte array.
+pub(crate) type MicroSequence = GpuArray<u8>;
+
+/// MicroSequence builder.
+pub(crate) struct Builder {
+    ops: Vec<u8>,
+}
+
+impl Builder {
+    /// Create a new Builder object
+    pub(crate) fn new() -> Builder {
+        Builder { ops: Vec::new() }
+    }
+
+    /// Get the relative offset from the current pointer to a given target offset.
+    ///
+    /// Used for relative jumps.
+    pub(crate) fn offset_to(&self, target: i32) -> i32 {
+        target - self.ops.len() as i32
+    }
+
+    /// Add an operation to the end of the sequence.
+    pub(crate) fn add<T: microseq::Operation>(&mut self, op: T) -> Result<i32> {
+        let off = self.ops.len();
+        let p: *const T = &op;
+        let p: *const u8 = p as *const u8;
+        let s: &[u8] = unsafe { core::slice::from_raw_parts(p, core::mem::size_of::<T>()) };
+        self.ops.try_extend_from_slice(s)?;
+        Ok(off as i32)
+    }
+
+    /// Collect all submitted operations into a finalized GPU object.
+    pub(crate) fn build(self, alloc: &mut Allocator) -> Result<MicroSequence> {
+        let mut array = alloc.array_empty::<u8>(self.ops.len())?;
+
+        array.as_mut_slice().clone_from_slice(self.ops.as_slice());
+        Ok(array)
+    }
+}
diff --git a/drivers/gpu/drm/asahi/mmu.rs b/drivers/gpu/drm/asahi/mmu.rs
new file mode 100644
index 000000000000..226ca0b7c1d7
--- /dev/null
+++ b/drivers/gpu/drm/asahi/mmu.rs
@@ -0,0 +1,1249 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+
+//! GPU UAT (MMU) management
+//!
+//! AGX GPUs use an MMU called the UAT, which is largely compatible with the ARM64 page table
+//! format. This module manages the global MMU structures, including a shared handoff structure
+//! that is used to coordinate VM management operations with the firmware, the TTBAT which points
+//! to currently active GPU VM contexts, as well as the individual `Vm` operations to map and
+//! unmap buffer objects into a single user or kernel address space.
+//!
+//! The actual page table management is delegated to the common kernel `io_pgtable` code.
+
+use core::fmt::Debug;
+use core::mem::size_of;
+use core::ptr::{addr_of_mut, NonNull};
+use core::sync::atomic::{fence, AtomicU32, AtomicU64, AtomicU8, Ordering};
+use core::time::Duration;
+
+use kernel::{
+    bindings, c_str, delay, device,
+    drm::mm,
+    error::{to_result, Result},
+    io_pgtable,
+    io_pgtable::{prot, AppleUAT, IoPageTable},
+    prelude::*,
+    sync::{smutex::Mutex, Guard},
+    sync::{Arc, LockClassKey, UniqueArc},
+    time,
+    types::ForeignOwnable,
+};
+
+use crate::debug::*;
+use crate::no_debug;
+use crate::{driver, fw, gem, hw, mem, slotalloc};
+
+const DEBUG_CLASS: DebugFlags = DebugFlags::Mmu;
+
+/// PPL magic number for the handoff region
+const PPL_MAGIC: u64 = 0x4b1d000000000002;
+
+/// Number of supported context entries in the TTBAT
+const UAT_NUM_CTX: usize = 64;
+/// First context available for users
+const UAT_USER_CTX_START: usize = 1;
+/// Number of available user contexts
+const UAT_USER_CTX: usize = UAT_NUM_CTX - UAT_USER_CTX_START;
+
+/// Number of bits in a page offset.
+pub(crate) const UAT_PGBIT: usize = 14;
+/// UAT page size.
+pub(crate) const UAT_PGSZ: usize = 1 << UAT_PGBIT;
+/// UAT page offset mask.
+pub(crate) const UAT_PGMSK: usize = UAT_PGSZ - 1;
+
+type Pte = AtomicU64;
+
+/// Number of PTEs per page.
+const UAT_NPTE: usize = UAT_PGSZ / size_of::<Pte>();
+
+/// UAT input address space (user)
+pub(crate) const UAT_IAS: usize = 39;
+/// "Fake" kernel UAT input address space (one page level lower)
+pub(crate) const UAT_IAS_KERN: usize = 36;
+
+/// Lower/user base VA
+const IOVA_USER_BASE: usize = UAT_PGSZ;
+/// Lower/user top VA
+const IOVA_USER_TOP: usize = (1 << UAT_IAS) - 1;
+/// Upper/kernel base VA
+// const IOVA_TTBR1_BASE: usize = 0xffffff8000000000;
+/// Driver-managed kernel base VA
+const IOVA_KERN_BASE: usize = 0xffffffa000000000;
+/// Driver-managed kernel top VA
+const IOVA_KERN_TOP: usize = 0xffffffafffffffff;
+
+const TTBR_VALID: u64 = 0x1; // BIT(0)
+const TTBR_ASID_SHIFT: usize = 48;
+
+const PTE_TABLE: u64 = 0x3; // BIT(0) | BIT(1)
+
+// Mapping protection types
+
+// Note: prot::CACHE means "cache coherency", which for UAT means *uncached*,
+// since uncached mappings from the GFX ASC side are cache coherent with the AP cache.
+// Not having that flag means *cached noncoherent*.
+
+/// Firmware MMIO R/W
+pub(crate) const PROT_FW_MMIO_RW: u32 =
+    prot::PRIV | prot::READ | prot::WRITE | prot::CACHE | prot::MMIO;
+/// Firmware MMIO R/O
+pub(crate) const PROT_FW_MMIO_RO: u32 = prot::PRIV | prot::READ | prot::CACHE | prot::MMIO;
+/// Firmware shared (uncached) RW
+pub(crate) const PROT_FW_SHARED_RW: u32 = prot::PRIV | prot::READ | prot::WRITE | prot::CACHE;
+/// Firmware shared (uncached) RO
+pub(crate) const PROT_FW_SHARED_RO: u32 = prot::PRIV | prot::READ | prot::CACHE;
+/// Firmware private (cached) RW
+pub(crate) const PROT_FW_PRIV_RW: u32 = prot::PRIV | prot::READ | prot::WRITE;
+/*
+/// Firmware private (cached) RO
+pub(crate) const PROT_FW_PRIV_RO: u32 = prot::PRIV | prot::READ;
+*/
+/// Firmware/GPU shared (uncached) RW
+pub(crate) const PROT_GPU_FW_SHARED_RW: u32 = prot::READ | prot::WRITE | prot::CACHE;
+/// Firmware/GPU shared (private) RW
+pub(crate) const PROT_GPU_FW_PRIV_RW: u32 = prot::READ | prot::WRITE;
+/// GPU shared/coherent RW
+pub(crate) const PROT_GPU_SHARED_RW: u32 = prot::READ | prot::WRITE | prot::CACHE | prot::NOEXEC;
+/// GPU shared/coherent RO
+pub(crate) const PROT_GPU_SHARED_RO: u32 = prot::READ | prot::CACHE | prot::NOEXEC;
+/// GPU shared/coherent WO
+pub(crate) const PROT_GPU_SHARED_WO: u32 = prot::WRITE | prot::CACHE | prot::NOEXEC;
+/*
+/// GPU private/noncoherent RW
+pub(crate) const PROT_GPU_PRIV_RW: u32 = prot::READ | prot::WRITE | prot::NOEXEC;
+/// GPU private/noncoherent RO
+pub(crate) const PROT_GPU_PRIV_RO: u32 = prot::READ | prot::NOEXEC;
+*/
+
+type PhysAddr = bindings::phys_addr_t;
+
+/// A pre-allocated memory region for UAT management
+struct UatRegion {
+    base: PhysAddr,
+    map: NonNull<core::ffi::c_void>,
+}
+
+/// It's safe to share UAT region records across threads.
+unsafe impl Send for UatRegion {}
+unsafe impl Sync for UatRegion {}
+
+/// Handoff region flush info structure
+#[repr(C)]
+struct FlushInfo {
+    state: AtomicU64,
+    addr: AtomicU64,
+    size: AtomicU64,
+}
+
+/// UAT Handoff region layout
+#[repr(C)]
+struct Handoff {
+    magic_ap: AtomicU64,
+    magic_fw: AtomicU64,
+
+    lock_ap: AtomicU8,
+    lock_fw: AtomicU8,
+    // Implicit padding: 2 bytes
+    turn: AtomicU32,
+    cur_slot: AtomicU32,
+    // Implicit padding: 4 bytes
+    flush: [FlushInfo; UAT_NUM_CTX + 1],
+
+    unk2: AtomicU8,
+    // Implicit padding: 7 bytes
+    unk3: AtomicU64,
+}
+
+const HANDOFF_SIZE: usize = size_of::<Handoff>();
+
+/// One VM slot in the TTBAT
+#[repr(C)]
+struct SlotTTBS {
+    ttb0: AtomicU64,
+    ttb1: AtomicU64,
+}
+
+const SLOTS_SIZE: usize = UAT_NUM_CTX * size_of::<SlotTTBS>();
+
+// We need at least page 0 (ttb0)
+const PAGETABLES_SIZE: usize = UAT_PGSZ;
+
+/// Inner data for a Vm instance. This is reference-counted by the outer Vm object.
+struct VmInner {
+    dev: driver::AsahiDevice,
+    is_kernel: bool,
+    min_va: usize,
+    max_va: usize,
+    page_table: AppleUAT<Uat>,
+    mm: mm::Allocator<(), MappingInner>,
+    uat_inner: Arc<UatInner>,
+    active_users: usize,
+    binding: Option<slotalloc::Guard<SlotInner>>,
+    bind_token: Option<slotalloc::SlotToken>,
+    id: u64,
+}
+
+impl VmInner {
+    /// Returns the slot index, if this VM is bound.
+    fn slot(&self) -> Option<u32> {
+        if self.is_kernel {
+            // The GFX ASC does not care about the ASID. Pick an arbitrary one.
+            // TODO: This needs to be a persistently reserved ASID once we integrate
+            // with the ARM64 kernel ASID machinery to avoid overlap.
+            Some(0)
+        } else {
+            // We don't check whether we lost the slot, which could cause unnecessary
+            // invalidations against another Vm. However, this situation should be very
+            // rare (e.g. a Vm lost its slot, which means 63 other Vms bound in the
+            // interim, and then it gets killed / drops its mappings without doing any
+            // final rendering). Anything doing active maps/unmaps is probably also
+            // rendering and therefore likely bound.
+            self.bind_token
+                .as_ref()
+                .map(|token| (token.last_slot() + UAT_USER_CTX_START as u32))
+        }
+    }
+
+    /// Returns the translation table base for this Vm
+    fn ttb(&self) -> u64 {
+        self.page_table.cfg().ttbr
+    }
+
+    /// Map an IOVA to the shifted address the underlying io_pgtable uses.
+    fn map_iova(&self, iova: usize, size: usize) -> Result<usize> {
+        if iova < self.min_va || (iova + size - 1) > self.max_va {
+            Err(EINVAL)
+        } else if self.is_kernel {
+            Ok(iova - self.min_va)
+        } else {
+            Ok(iova)
+        }
+    }
+
+    /// Map a contiguous range of virtual->physical pages.
+    fn map_pages(
+        &mut self,
+        mut iova: usize,
+        mut paddr: usize,
+        pgsize: usize,
+        pgcount: usize,
+        prot: u32,
+    ) -> Result<usize> {
+        let mut left = pgcount;
+        while left > 0 {
+            let mapped_iova = self.map_iova(iova, pgsize * left)?;
+            let mapped = self
+                .page_table
+                .map_pages(mapped_iova, paddr, pgsize, left, prot)?;
+            assert!(mapped <= left * pgsize);
+
+            left -= mapped / pgsize;
+            paddr += mapped;
+            iova += mapped;
+        }
+        Ok(pgcount * pgsize)
+    }
+
+    /// Unmap a contiguous range of pages.
+    fn unmap_pages(&mut self, mut iova: usize, pgsize: usize, pgcount: usize) -> Result<usize> {
+        let mut left = pgcount;
+        while left > 0 {
+            let mapped_iova = self.map_iova(iova, pgsize * left)?;
+            let unmapped = self.page_table.unmap_pages(mapped_iova, pgsize, left);
+            assert!(unmapped <= left * pgsize);
+
+            left -= unmapped / pgsize;
+            iova += unmapped;
+        }
+
+        Ok(pgcount * pgsize)
+    }
+
+    /// Map an `mm::Node` representing an mapping in VA space.
+    fn map_node(&mut self, node: &mm::Node<(), MappingInner>, prot: u32) -> Result {
+        let mut iova = node.start() as usize;
+        let sgt = node.sgt.as_ref().ok_or(EINVAL)?;
+
+        for range in sgt.iter() {
+            let addr = range.dma_address();
+            let len = range.dma_len();
+
+            if (addr | len | iova) & UAT_PGMSK != 0 {
+                dev_err!(
+                    self.dev,
+                    "MMU: Mapping {:#x}:{:#x} -> {:#x} is not page-aligned\n",
+                    addr,
+                    len,
+                    iova
+                );
+                return Err(EINVAL);
+            }
+
+            mod_dev_dbg!(
+                self.dev,
+                "MMU: map: {:#x}:{:#x} -> {:#x}\n",
+                addr,
+                len,
+                iova
+            );
+
+            self.map_pages(iova, addr, UAT_PGSZ, len >> UAT_PGBIT, prot)?;
+
+            iova += len;
+        }
+        Ok(())
+    }
+}
+
+/// Shared reference to a virtual memory address space ([`Vm`]).
+#[derive(Clone)]
+pub(crate) struct Vm {
+    id: u64,
+    file_id: u64,
+    inner: Arc<Mutex<VmInner>>,
+}
+no_debug!(Vm);
+
+/// Slot data for a [`Vm`] slot (nothing, we only care about the indices).
+pub(crate) struct SlotInner();
+
+impl slotalloc::SlotItem for SlotInner {
+    type Data = ();
+}
+
+/// Represents a single user of a binding of a [`Vm`] to a slot.
+///
+/// The number of users is counted, and the slot will be freed when it drops to 0.
+#[derive(Debug)]
+pub(crate) struct VmBind(Vm, u32);
+
+impl VmBind {
+    /// Returns the slot that this `Vm` is bound to.
+    pub(crate) fn slot(&self) -> u32 {
+        self.1
+    }
+}
+
+impl Drop for VmBind {
+    fn drop(&mut self) {
+        let mut inner = self.0.inner.lock();
+
+        assert_ne!(inner.active_users, 0);
+        inner.active_users -= 1;
+        mod_pr_debug!("MMU: slot {} active users {}\n", self.1, inner.active_users);
+        if inner.active_users == 0 {
+            inner.binding = None;
+        }
+    }
+}
+
+impl Clone for VmBind {
+    fn clone(&self) -> VmBind {
+        let mut inner = self.0.inner.lock();
+
+        inner.active_users += 1;
+        mod_pr_debug!("MMU: slot {} active users {}\n", self.1, inner.active_users);
+        VmBind(self.0.clone(), self.1)
+    }
+}
+
+/// Inner data required for an object mapping into a [`Vm`].
+pub(crate) struct MappingInner {
+    owner: Arc<Mutex<VmInner>>,
+    uat_inner: Arc<UatInner>,
+    prot: u32,
+    mapped_size: usize,
+    sgt: Option<gem::SGTable>,
+}
+
+/// An object mapping into a [`Vm`], which reserves the address range from use by other mappings.
+pub(crate) struct Mapping(mm::Node<(), MappingInner>);
+
+impl Mapping {
+    /// Returns the IOVA base of this mapping
+    pub(crate) fn iova(&self) -> usize {
+        self.0.start() as usize
+    }
+
+    /// Returns the size of this mapping in bytes
+    pub(crate) fn size(&self) -> usize {
+        self.0.mapped_size
+    }
+
+    /// Remap a cached mapping as uncached, then synchronously flush that range of VAs from the
+    /// coprocessor cache. This is required to safely unmap cached/private mappings.
+    fn remap_uncached_and_flush(&mut self) {
+        let mut owner = self.0.owner.lock();
+        mod_dev_dbg!(
+            owner.dev,
+            "MMU: remap as uncached {:#x}:{:#x}\n",
+            self.iova(),
+            self.size()
+        );
+
+        // The IOMMU API does not allow us to remap things in-place...
+        // just do an unmap and map again for now.
+        // Do not try to unmap guard page (-1)
+        if owner
+            .unmap_pages(self.iova(), UAT_PGSZ, self.size() >> UAT_PGBIT)
+            .is_err()
+        {
+            dev_err!(
+                owner.dev,
+                "MMU: unmap for remap {:#x}:{:#x} failed\n",
+                self.iova(),
+                self.size()
+            );
+        }
+
+        let prot = self.0.prot | prot::CACHE;
+        if owner.map_node(&self.0, prot).is_err() {
+            dev_err!(
+                owner.dev,
+                "MMU: remap {:#x}:{:#x} failed\n",
+                self.iova(),
+                self.size()
+            );
+        }
+
+        // If we don't have (and have never had) a VM slot, just return
+        let slot = match owner.slot() {
+            None => return,
+            Some(slot) => slot,
+        };
+
+        let flush_slot = if owner.is_kernel {
+            // If this is a kernel mapping, always flush on index 64
+            UAT_NUM_CTX as u32
+        } else {
+            // Otherwise, check if this slot is the active one, otherwise return
+            // Also check that we actually own this slot
+            let ttb = owner.ttb() | TTBR_VALID | (slot as u64) << TTBR_ASID_SHIFT;
+
+            let uat_inner = self.0.uat_inner.lock();
+            uat_inner.handoff().lock();
+            let cur_slot = uat_inner.handoff().current_slot();
+            let ttb_cur = uat_inner.ttbs()[slot as usize].ttb0.load(Ordering::Relaxed);
+            uat_inner.handoff().unlock();
+            if cur_slot == Some(slot) && ttb_cur == ttb {
+                slot
+            } else {
+                return;
+            }
+        };
+
+        // FIXME: There is a race here, though it'll probably never happen in practice.
+        // In theory, it's possible for the ASC to finish using our slot, whatever command
+        // it was processing to complete, the slot to be lost to another context, and the ASC
+        // to begin using it again with a different page table, thus faulting when it gets a
+        // flush request here. In practice, the chance of this happening is probably vanishingly
+        // small, as all 62 other slots would have to be recycled or in use before that slot can
+        // be reused, and the ASC using user contexts at all is very rare.
+
+        // Still, the locking around UAT/Handoff/TTBs should probably be redesigned to better
+        // model the interactions with the firmware and avoid these races.
+        // Possibly TTB changes should be tied to slot locks:
+
+        // Flush:
+        //  - Can early check handoff here (no need to lock).
+        //      If user slot and it doesn't match the active ASC slot,
+        //      we can elide the flush as the ASC guarantees it flushes
+        //      TLBs/caches when it switches context. We just need a
+        //      barrier to ensure ordering.
+        //  - Lock TTB slot
+        //      - If user ctx:
+        //          - Lock handoff AP-side
+        //              - Lock handoff dekker
+        //                  - Check TTB & handoff cur ctx
+        //      - Perform flush if necessary
+        //          - This implies taking the fwring lock
+        //
+        // TTB change:
+        //  - lock TTB slot
+        //      - lock handoff AP-side
+        //          - lock handoff dekker
+        //              change TTB
+
+        // Lock this flush slot, and write the range to it
+        let flush = self.0.uat_inner.lock_flush(flush_slot);
+        let pages = self.size() >> UAT_PGBIT;
+        flush.begin_flush(self.iova() as u64, self.size() as u64);
+        if pages >= 0x10000 {
+            dev_err!(owner.dev, "MMU: Flush too big ({:#x} pages))\n", pages);
+        }
+
+        let cmd = fw::channels::FwCtlMsg {
+            addr: fw::types::U64(self.iova() as u64),
+            unk_8: 0,
+            slot: flush_slot,
+            page_count: pages as u16,
+            unk_12: 2, // ?
+        };
+
+        // Tell the firmware to do a cache flush
+        if let Err(e) = owner.dev.data().gpu.fwctl(cmd) {
+            dev_err!(
+                owner.dev,
+                "MMU: ASC cache flush {:#x}:{:#x} failed (err: {:?})\n",
+                self.iova(),
+                self.size(),
+                e
+            );
+        }
+
+        // Finish the flush
+        flush.end_flush();
+
+        // Slot is unlocked here
+    }
+}
+
+impl Drop for Mapping {
+    fn drop(&mut self) {
+        // This is the main unmap function for UAT mappings.
+        // The sequence of operations here is finicky, due to the interaction
+        // between cached GFX ASC mappings and the page tables. These mappings
+        // always have to be flushed from the cache before being unmapped.
+
+        // For uncached mappings, just unmapping and flushing the TLB is sufficient.
+
+        // For cached mappings, this is the required sequence:
+        // 1. Remap it as uncached
+        // 2. Flush the TLB range
+        // 3. If kernel VA mapping OR user VA mapping and handoff.current_slot() == slot:
+        //    a. Take a lock for this slot
+        //    b. Write the flush range to the right context slot in handoff area
+        //    c. Issue a cache invalidation request via FwCtl queue
+        //    d. Poll for completion via queue
+        //    e. Check for completion flag in the handoff area
+        //    f. Drop the lock
+        // 4. Unmap
+        // 5. Flush the TLB range again
+
+        // prot::CACHE means "cache coherent" which means *uncached* here.
+        if self.0.prot & prot::CACHE == 0 {
+            self.remap_uncached_and_flush();
+        }
+
+        let mut owner = self.0.owner.lock();
+        mod_dev_dbg!(
+            owner.dev,
+            "MMU: unmap {:#x}:{:#x}\n",
+            self.iova(),
+            self.size()
+        );
+
+        if owner
+            .unmap_pages(self.iova(), UAT_PGSZ, self.size() >> UAT_PGBIT)
+            .is_err()
+        {
+            dev_err!(
+                owner.dev,
+                "MMU: unmap {:#x}:{:#x} failed\n",
+                self.iova(),
+                self.size()
+            );
+        }
+
+        if let Some(asid) = owner.slot() {
+            mem::tlbi_range(asid as u8, self.iova(), self.size());
+            mod_dev_dbg!(
+                owner.dev,
+                "MMU: flush range: asid={:#x} start={:#x} len={:#x}\n",
+                asid,
+                self.iova(),
+                self.size()
+            );
+            mem::sync();
+        }
+    }
+}
+
+/// Shared UAT global data structures
+struct UatShared {
+    handoff_rgn: UatRegion,
+    ttbs_rgn: UatRegion,
+}
+
+impl UatShared {
+    /// Returns the handoff region area
+    fn handoff(&self) -> &Handoff {
+        // SAFETY: pointer is non-null per the type invariant
+        unsafe { (self.handoff_rgn.map.as_ptr() as *mut Handoff).as_ref() }.unwrap()
+    }
+
+    /// Returns the TTBAT area
+    fn ttbs(&self) -> &[SlotTTBS; UAT_NUM_CTX] {
+        // SAFETY: pointer is non-null per the type invariant
+        unsafe { (self.ttbs_rgn.map.as_ptr() as *mut [SlotTTBS; UAT_NUM_CTX]).as_ref() }.unwrap()
+    }
+}
+
+// SAFETY: Nothing here is unsafe to send across threads.
+unsafe impl Send for UatShared {}
+
+/// Inner data for the top-level UAT instance.
+struct UatInner {
+    shared: Mutex<UatShared>,
+    handoff_flush: [Mutex<HandoffFlush>; UAT_NUM_CTX + 1],
+}
+
+impl UatInner {
+    /// Take the lock on the shared data and return the guard.
+    fn lock(&self) -> Guard<'_, Mutex<UatShared>> {
+        self.shared.lock()
+    }
+
+    /// Take a lock on a handoff flush slot and return the guard.
+    fn lock_flush(&self, slot: u32) -> Guard<'_, Mutex<HandoffFlush>> {
+        self.handoff_flush[slot as usize].lock()
+    }
+}
+
+/// Top-level UAT manager object
+pub(crate) struct Uat {
+    dev: driver::AsahiDevice,
+    cfg: &'static hw::HwConfig,
+    pagetables_rgn: UatRegion,
+
+    inner: Arc<UatInner>,
+    slots: slotalloc::SlotAllocator<SlotInner>,
+
+    kernel_vm: Vm,
+    _kernel_lower_vm: Vm,
+}
+
+impl Drop for UatRegion {
+    fn drop(&mut self) {
+        // SAFETY: the pointer is valid by the type invariant
+        unsafe { bindings::memunmap(self.map.as_ptr()) };
+    }
+}
+
+impl Handoff {
+    /// Lock the handoff region from firmware access
+    fn lock(&self) {
+        self.lock_ap.store(1, Ordering::Relaxed);
+        fence(Ordering::SeqCst);
+
+        while self.lock_fw.load(Ordering::Relaxed) != 0 {
+            if self.turn.load(Ordering::Relaxed) != 0 {
+                self.lock_ap.store(0, Ordering::Relaxed);
+                while self.turn.load(Ordering::Relaxed) != 0 {}
+                self.lock_ap.store(1, Ordering::Relaxed);
+                fence(Ordering::SeqCst);
+            }
+        }
+        fence(Ordering::Acquire);
+    }
+
+    /// Unlock the handoff region, allowing firmware access
+    fn unlock(&self) {
+        self.turn.store(1, Ordering::Relaxed);
+        self.lock_ap.store(0, Ordering::Release);
+    }
+
+    /// Returns the current Vm slot mapped by the firmware for lower/unprivileged access, if any.
+    fn current_slot(&self) -> Option<u32> {
+        let slot = self.cur_slot.load(Ordering::Relaxed);
+        if slot == 0 || slot == u32::MAX {
+            None
+        } else {
+            Some(slot)
+        }
+    }
+
+    /// Initialize the handoff region
+    fn init(&self) -> Result {
+        self.magic_ap.store(PPL_MAGIC, Ordering::Relaxed);
+        self.cur_slot.store(0, Ordering::Relaxed);
+        self.unk3.store(0, Ordering::Relaxed);
+        fence(Ordering::SeqCst);
+
+        let timeout = time::ktime_get() + Duration::from_millis(1000);
+
+        self.lock();
+        while time::ktime_get() < timeout {
+            if self.magic_fw.load(Ordering::Relaxed) == PPL_MAGIC {
+                break;
+            } else {
+                self.unlock();
+                delay::coarse_sleep(Duration::from_millis(10));
+                self.lock();
+            }
+        }
+
+        if self.magic_fw.load(Ordering::Relaxed) != PPL_MAGIC {
+            self.unlock();
+            pr_err!("Handoff: Failed to initialize (firmware not running?)\n");
+            return Err(EIO);
+        }
+
+        self.unlock();
+
+        for i in 0..=UAT_NUM_CTX {
+            self.flush[i].state.store(0, Ordering::Relaxed);
+            self.flush[i].addr.store(0, Ordering::Relaxed);
+            self.flush[i].size.store(0, Ordering::Relaxed);
+        }
+        fence(Ordering::SeqCst);
+        Ok(())
+    }
+}
+
+/// Represents a single flush info slot in the handoff region.
+///
+/// # Invariants
+/// The pointer is valid and there is no aliasing HandoffFlush instance.
+struct HandoffFlush(*const FlushInfo);
+
+// SAFETY: These pointers are safe to send across threads.
+unsafe impl Send for HandoffFlush {}
+
+impl HandoffFlush {
+    /// Set up a flush operation for the coprocessor
+    fn begin_flush(&self, start: u64, size: u64) {
+        let flush = unsafe { self.0.as_ref().unwrap() };
+
+        let state = flush.state.load(Ordering::Relaxed);
+        if state != 0 {
+            pr_err!("Handoff: expected flush state 0, got {}\n", state);
+        }
+        flush.addr.store(start, Ordering::Relaxed);
+        flush.size.store(size, Ordering::Relaxed);
+        flush.state.store(1, Ordering::Relaxed);
+    }
+
+    /// Complete a flush operation for the coprocessor
+    fn end_flush(&self) {
+        let flush = unsafe { self.0.as_ref().unwrap() };
+        let state = flush.state.load(Ordering::Relaxed);
+        if state != 2 {
+            pr_err!("Handoff: expected flush state 2, got {}\n", state);
+        }
+        flush.state.store(0, Ordering::Relaxed);
+    }
+}
+
+// We do not implement FlushOps, since we flush manually in this module after
+// page table operations. Just provide dummy implementations.
+impl io_pgtable::FlushOps for Uat {
+    type Data = ();
+
+    fn tlb_flush_all(_data: <Self::Data as ForeignOwnable>::Borrowed<'_>) {}
+    fn tlb_flush_walk(
+        _data: <Self::Data as ForeignOwnable>::Borrowed<'_>,
+        _iova: usize,
+        _size: usize,
+        _granule: usize,
+    ) {
+    }
+    fn tlb_add_page(
+        _data: <Self::Data as ForeignOwnable>::Borrowed<'_>,
+        _iova: usize,
+        _granule: usize,
+    ) {
+    }
+}
+
+static LOCK_KEY: LockClassKey = LockClassKey::new();
+
+impl Vm {
+    /// Create a new virtual memory address space
+    fn new(
+        dev: driver::AsahiDevice,
+        uat_inner: Arc<UatInner>,
+        cfg: &'static hw::HwConfig,
+        is_kernel: bool,
+        id: u64,
+        file_id: u64,
+    ) -> Result<Vm> {
+        let page_table = AppleUAT::new(
+            &dev,
+            io_pgtable::Config {
+                pgsize_bitmap: UAT_PGSZ,
+                ias: if is_kernel { UAT_IAS_KERN } else { UAT_IAS },
+                oas: cfg.uat_oas,
+                coherent_walk: true,
+                quirks: 0,
+            },
+            (),
+        )?;
+        let min_va = if is_kernel {
+            IOVA_KERN_BASE
+        } else {
+            IOVA_USER_BASE
+        };
+        let max_va = if is_kernel {
+            IOVA_KERN_TOP
+        } else {
+            IOVA_USER_TOP
+        };
+
+        let mm = mm::Allocator::new(
+            min_va as u64,
+            (max_va - min_va + 1) as u64,
+            (),
+            c_str!("asahi Vm"),
+            &LOCK_KEY,
+        )?;
+
+        Ok(Vm {
+            id,
+            file_id,
+            inner: Arc::try_new(Mutex::new(VmInner {
+                dev,
+                min_va,
+                max_va,
+                is_kernel,
+                page_table,
+                mm,
+                uat_inner,
+                binding: None,
+                bind_token: None,
+                active_users: 0,
+                id,
+            }))?,
+        })
+    }
+
+    /// Get the translation table base for this Vm
+    fn ttb(&self) -> u64 {
+        self.inner.lock().ttb()
+    }
+
+    /// Map a GEM object (using its `SGTable`) into this Vm at a free address.
+    pub(crate) fn map(&self, size: usize, sgt: gem::SGTable) -> Result<Mapping> {
+        let mut inner = self.inner.lock();
+
+        let uat_inner = inner.uat_inner.clone();
+        let node = inner.mm.insert_node(
+            MappingInner {
+                owner: self.inner.clone(),
+                uat_inner,
+                prot: PROT_FW_SHARED_RW,
+                sgt: Some(sgt),
+                mapped_size: size,
+            },
+            (size + UAT_PGSZ) as u64, // Add guard page
+        )?;
+
+        inner.map_node(&node, PROT_FW_SHARED_RW)?;
+        Ok(Mapping(node))
+    }
+
+    /// Map a GEM object (using its `SGTable`) into this Vm at a free address in a given range.
+    #[allow(clippy::too_many_arguments)]
+    pub(crate) fn map_in_range(
+        &self,
+        size: usize,
+        sgt: gem::SGTable,
+        alignment: u64,
+        start: u64,
+        end: u64,
+        prot: u32,
+        guard: bool,
+    ) -> Result<Mapping> {
+        let mut inner = self.inner.lock();
+
+        let uat_inner = inner.uat_inner.clone();
+        let node = inner.mm.insert_node_in_range(
+            MappingInner {
+                owner: self.inner.clone(),
+                uat_inner,
+                prot,
+                sgt: Some(sgt),
+                mapped_size: size,
+            },
+            (size + if guard { UAT_PGSZ } else { 0 }) as u64, // Add guard page
+            alignment,
+            0,
+            start,
+            end,
+            mm::InsertMode::Best,
+        )?;
+
+        inner.map_node(&node, prot)?;
+        Ok(Mapping(node))
+    }
+
+    /// Map a GEM object (using its `SGTable`) into this Vm at a specific address.
+    #[allow(clippy::too_many_arguments)]
+    pub(crate) fn map_at(
+        &self,
+        addr: u64,
+        size: usize,
+        sgt: gem::SGTable,
+        prot: u32,
+        guard: bool,
+    ) -> Result<Mapping> {
+        let mut inner = self.inner.lock();
+
+        let uat_inner = inner.uat_inner.clone();
+        let node = inner.mm.reserve_node(
+            MappingInner {
+                owner: self.inner.clone(),
+                uat_inner,
+                prot,
+                sgt: Some(sgt),
+                mapped_size: size,
+            },
+            addr,
+            (size + if guard { UAT_PGSZ } else { 0 }) as u64, // Add guard page
+            0,
+        )?;
+
+        inner.map_node(&node, prot)?;
+        Ok(Mapping(node))
+    }
+
+    /// Add a direct MMIO mapping to this Vm at a free address.
+    pub(crate) fn map_io(&self, phys: usize, size: usize, rw: bool) -> Result<Mapping> {
+        let prot = if rw { PROT_FW_MMIO_RW } else { PROT_FW_MMIO_RO };
+        let mut inner = self.inner.lock();
+
+        let uat_inner = inner.uat_inner.clone();
+        let node = inner.mm.insert_node(
+            MappingInner {
+                owner: self.inner.clone(),
+                uat_inner,
+                prot,
+                sgt: None,
+                mapped_size: size,
+            },
+            (size + UAT_PGSZ) as u64, // Add guard page
+        )?;
+
+        let iova = node.start() as usize;
+
+        if (phys | size | iova) & UAT_PGMSK != 0 {
+            dev_err!(
+                inner.dev,
+                "MMU: Mapping {:#x}:{:#x} -> {:#x} is not page-aligned\n",
+                phys,
+                size,
+                iova
+            );
+            return Err(EINVAL);
+        }
+
+        dev_info!(
+            inner.dev,
+            "MMU: IO map: {:#x}:{:#x} -> {:#x}\n",
+            phys,
+            size,
+            iova
+        );
+
+        inner.map_pages(iova, phys, UAT_PGSZ, size >> UAT_PGBIT, prot)?;
+
+        Ok(Mapping(node))
+    }
+
+    /// Returns the unique ID of this Vm
+    pub(crate) fn id(&self) -> u64 {
+        self.id
+    }
+
+    /// Returns the unique File ID of the owner of this Vm
+    pub(crate) fn file_id(&self) -> u64 {
+        self.file_id
+    }
+}
+
+impl Drop for VmInner {
+    fn drop(&mut self) {
+        assert_eq!(self.active_users, 0);
+
+        mod_pr_debug!(
+            "VmInner::Drop [{}]: bind_token={:?}\n",
+            self.id,
+            self.bind_token
+        );
+
+        // Make sure this VM is not mapped to a TTB if it was
+        if let Some(token) = self.bind_token.take() {
+            let idx = (token.last_slot() as usize) + UAT_USER_CTX_START;
+            let ttb = self.ttb() | TTBR_VALID | (idx as u64) << TTBR_ASID_SHIFT;
+
+            let uat_inner = self.uat_inner.lock();
+            uat_inner.handoff().lock();
+            let handoff_cur = uat_inner.handoff().current_slot();
+            let ttb_cur = uat_inner.ttbs()[idx].ttb0.load(Ordering::SeqCst);
+            let inval = ttb_cur == ttb;
+            if inval {
+                if handoff_cur == Some(idx as u32) {
+                    pr_err!(
+                        "VmInner::drop owning slot {}, but it is currently in use by the ASC?\n",
+                        idx
+                    );
+                }
+                uat_inner.ttbs()[idx].ttb0.store(0, Ordering::SeqCst);
+            }
+            uat_inner.handoff().unlock();
+            core::mem::drop(uat_inner);
+
+            // In principle we dropped all the Mappings already, but we might as
+            // well play it safe and invalidate the whole ASID.
+            if inval {
+                mod_pr_debug!(
+                    "VmInner::Drop [{}]: need inval for ASID {:#x}\n",
+                    self.id,
+                    idx
+                );
+                mem::tlbi_asid(idx as u8);
+                mem::sync();
+            }
+        }
+    }
+}
+
+impl Uat {
+    /// Map a bootloader-preallocated memory region
+    fn map_region(
+        dev: &dyn device::RawDevice,
+        name: &CStr,
+        size: usize,
+        cached: bool,
+    ) -> Result<UatRegion> {
+        let rdev = dev.raw_device();
+
+        let mut res = core::mem::MaybeUninit::<bindings::resource>::uninit();
+
+        let res = unsafe {
+            let idx = bindings::of_property_match_string(
+                (*rdev).of_node,
+                c_str!("memory-region-names").as_char_ptr(),
+                name.as_char_ptr(),
+            );
+            to_result(idx)?;
+
+            let np = bindings::of_parse_phandle(
+                (*rdev).of_node,
+                c_str!("memory-region").as_char_ptr(),
+                idx,
+            );
+            if np.is_null() {
+                dev_err!(dev, "Missing {} region\n", name);
+                return Err(EINVAL);
+            }
+            let ret = bindings::of_address_to_resource(np, 0, res.as_mut_ptr());
+            bindings::of_node_put(np);
+
+            if ret < 0 {
+                dev_err!(dev, "Failed to get {} region\n", name);
+                to_result(ret)?
+            }
+
+            res.assume_init()
+        };
+
+        let rgn_size: usize = unsafe { bindings::resource_size(&res) } as usize;
+
+        if size > rgn_size {
+            dev_err!(
+                dev,
+                "Region {} is too small (expected {}, got {})\n",
+                name,
+                size,
+                rgn_size
+            );
+            return Err(ENOMEM);
+        }
+
+        let flags = if cached {
+            bindings::MEMREMAP_WB
+        } else {
+            bindings::MEMREMAP_WC
+        };
+        let map = unsafe { bindings::memremap(res.start, rgn_size, flags.into()) };
+        let map = NonNull::new(map);
+
+        match map {
+            None => {
+                dev_err!(dev, "Failed to remap {} region\n", name);
+                Err(ENOMEM)
+            }
+            Some(map) => Ok(UatRegion {
+                base: res.start,
+                map,
+            }),
+        }
+    }
+
+    /// Returns a view into the root kernel (upper half) page table
+    fn kpt0(&self) -> &[Pte; UAT_NPTE] {
+        // SAFETY: pointer is non-null per the type invariant
+        unsafe { (self.pagetables_rgn.map.as_ptr() as *mut [Pte; UAT_NPTE]).as_ref() }.unwrap()
+    }
+
+    /// Returns a reference to the global kernel (upper half) `Vm`
+    pub(crate) fn kernel_vm(&self) -> &Vm {
+        &self.kernel_vm
+    }
+
+    /// Returns the base physical address of the TTBAT region.
+    pub(crate) fn ttb_base(&self) -> u64 {
+        let inner = self.inner.lock();
+
+        inner.ttbs_rgn.base
+    }
+
+    /// Binds a `Vm` to a slot, preferring the last used one.
+    pub(crate) fn bind(&self, vm: &Vm) -> Result<VmBind> {
+        let mut inner = vm.inner.lock();
+
+        if inner.binding.is_none() {
+            assert_eq!(inner.active_users, 0);
+
+            let slot = self.slots.get(inner.bind_token)?;
+            if slot.changed() {
+                mod_pr_debug!("Vm Bind [{}]: bind_token={:?}\n", vm.id, slot.token(),);
+                let idx = (slot.slot() as usize) + UAT_USER_CTX_START;
+                let ttb = inner.ttb() | TTBR_VALID | (idx as u64) << TTBR_ASID_SHIFT;
+
+                let uat_inner = self.inner.lock();
+                let ttbs = uat_inner.ttbs();
+                uat_inner.handoff().lock();
+                if uat_inner.handoff().current_slot() == Some(idx as u32) {
+                    pr_err!(
+                        "Vm::bind to slot {}, but it is currently in use by the ASC?\n",
+                        idx
+                    );
+                }
+                ttbs[idx].ttb0.store(ttb, Ordering::Relaxed);
+                ttbs[idx].ttb1.store(0, Ordering::Relaxed);
+                uat_inner.handoff().unlock();
+                core::mem::drop(uat_inner);
+
+                // Make sure all TLB entries from the previous owner of this ASID are gone
+                mem::tlbi_asid(idx as u8);
+                mem::sync();
+            }
+
+            inner.bind_token = Some(slot.token());
+            inner.binding = Some(slot);
+        }
+
+        inner.active_users += 1;
+
+        let slot = inner.binding.as_ref().unwrap().slot() + UAT_USER_CTX_START as u32;
+        mod_pr_debug!("MMU: slot {} active users {}\n", slot, inner.active_users);
+        Ok(VmBind(vm.clone(), slot))
+    }
+
+    /// Creates a new `Vm` linked to this UAT.
+    pub(crate) fn new_vm(&self, id: u64, file_id: u64) -> Result<Vm> {
+        Vm::new(
+            self.dev.clone(),
+            self.inner.clone(),
+            self.cfg,
+            false,
+            id,
+            file_id,
+        )
+    }
+
+    /// Creates the reference-counted inner data for a new `Uat` instance.
+    #[inline(never)]
+    fn make_inner(dev: &driver::AsahiDevice) -> Result<Arc<UatInner>> {
+        let handoff_rgn = Self::map_region(dev, c_str!("handoff"), HANDOFF_SIZE, false)?;
+        let ttbs_rgn = Self::map_region(dev, c_str!("ttbs"), SLOTS_SIZE, false)?;
+
+        dev_info!(dev, "MMU: Initializing kernel page table\n");
+
+        let mut inner = UniqueArc::<UatInner>::try_new_uninit()?;
+        let ptr = inner.as_mut_ptr();
+
+        Ok(unsafe {
+            let handoff = &(handoff_rgn.map.as_ptr() as *mut Handoff).as_ref().unwrap();
+
+            for i in 0..UAT_NUM_CTX + 1 {
+                addr_of_mut!((*ptr).handoff_flush[i])
+                    .write(Mutex::new(HandoffFlush(&handoff.flush[i])));
+            }
+
+            addr_of_mut!((*ptr).shared).write(Mutex::new(UatShared {
+                handoff_rgn,
+                ttbs_rgn,
+            }));
+
+            inner.assume_init()
+        }
+        .into())
+    }
+
+    /// Creates a new `Uat` instance given the relevant hardware config.
+    #[inline(never)]
+    pub(crate) fn new(dev: &driver::AsahiDevice, cfg: &'static hw::HwConfig) -> Result<Self> {
+        dev_info!(dev, "MMU: Initializing...\n");
+
+        let inner = Self::make_inner(dev)?;
+
+        let pagetables_rgn = Self::map_region(dev, c_str!("pagetables"), PAGETABLES_SIZE, true)?;
+
+        dev_info!(dev, "MMU: Creating kernel page tables\n");
+        let kernel_lower_vm = Vm::new(dev.clone(), inner.clone(), cfg, false, 1, 0)?;
+        let kernel_vm = Vm::new(dev.clone(), inner.clone(), cfg, true, 0, 0)?;
+
+        dev_info!(dev, "MMU: Kernel page tables created\n");
+
+        let ttb0 = kernel_lower_vm.ttb();
+        let ttb1 = kernel_vm.ttb();
+
+        let uat = Self {
+            dev: dev.clone(),
+            cfg,
+            pagetables_rgn,
+            kernel_vm,
+            _kernel_lower_vm: kernel_lower_vm,
+            inner,
+            slots: slotalloc::SlotAllocator::new(UAT_USER_CTX as u32, (), |_inner, _slot| {
+                SlotInner()
+            })?,
+        };
+
+        let inner = uat.inner.lock();
+
+        inner.handoff().init()?;
+
+        dev_info!(dev, "MMU: Initializing TTBs\n");
+
+        inner.handoff().lock();
+
+        let ttbs = inner.ttbs();
+
+        ttbs[0].ttb0.store(ttb0 | TTBR_VALID, Ordering::Relaxed);
+        ttbs[0]
+            .ttb1
+            .store(uat.pagetables_rgn.base | TTBR_VALID, Ordering::Relaxed);
+
+        for ctx in &ttbs[1..] {
+            ctx.ttb0.store(0, Ordering::Relaxed);
+            ctx.ttb1.store(0, Ordering::Relaxed);
+        }
+
+        inner.handoff().unlock();
+
+        core::mem::drop(inner);
+
+        uat.kpt0()[2].store(ttb1 | PTE_TABLE, Ordering::Relaxed);
+
+        dev_info!(dev, "MMU: initialized\n");
+
+        Ok(uat)
+    }
+}
+
+impl Drop for Uat {
+    fn drop(&mut self) {
+        // Unmap what we mapped
+        self.kpt0()[2].store(0, Ordering::Relaxed);
+
+        // Make sure we flush the TLBs
+        fence(Ordering::SeqCst);
+        mem::tlbi_all();
+        mem::sync();
+    }
+}
diff --git a/drivers/gpu/drm/asahi/object.rs b/drivers/gpu/drm/asahi/object.rs
new file mode 100644
index 000000000000..449899b88181
--- /dev/null
+++ b/drivers/gpu/drm/asahi/object.rs
@@ -0,0 +1,704 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+
+//! Asahi GPU object model
+//!
+//! The AGX GPU includes a coprocessor that uses a large number of shared memory structures to
+//! communicate with the driver. These structures contain GPU VA pointers to each other, which are
+//! directly dereferenced by the firmware and are expected to always be valid for the usage
+//! lifetime of the containing struct (which is an implicit contract, not explicitly managed).
+//! Any faults cause an unrecoverable firmware crash, requiring a full system reboot.
+//!
+//! In order to manage this complexity safely, we implement a GPU object model using Rust's type
+//! system to enforce GPU object lifetime relationships. GPU objects represent an allocated piece
+//! of memory of a given type, mapped to the GPU (and usually also the CPU). On the CPU side,
+//! these objects are associated with a pure Rust structure that contains the objects it depends
+//! on (or references to them). This allows us to map Rust lifetimes into the GPU object model
+//! system. Then, GPU VA pointers also inherit those lifetimes, which means the Rust borrow checker
+//! can ensure that all pointers are assigned an address that is guaranteed to outlive the GPU
+//! object it points to.
+//!
+//! Since the firmware object model does have self-referencing pointers (and there is of course no
+//! underlying revocability mechanism to make it safe), we must have an escape hatch. GPU pointers
+//! can be weak pointers, which do not enforce lifetimes. In those cases, it is the user's
+//! responsibility to ensure that lifetime requirements are met.
+//!
+//! In other words, the model is necessarily leaky and there is no way to fully map Rust safety to
+//! GPU firmware object safety. The goal of the model is to make it easy to model the lifetimes of
+//! GPU objects and have the compiler help in avoiding mistakes, rather than to guarantee safety
+//! 100% of the time as would be the case for CPU-side Rust code.
+
+// TODO: There is a fundamental soundness issue with sharing memory with the GPU (that even affects
+// C code too). Since the GPU is free to mutate that memory at any time, normal reference invariants
+// cannot be enforced on the CPU side. For example, the compiler could perform an optimization that
+// assumes that a given memory location does not change between two reads, and causes UB otherwise,
+// and then the GPU could mutate that memory out from under the CPU.
+//
+// For cases where we *expect* this to happen, we use atomic types, which avoid this issue. However,
+// doing so for every single field of every type is a non-starter. Right now, there seems to be no
+// good solution for this that does not come with significant performance or ergonomics downsides.
+//
+// In *practice* we are almost always only writing GPU memory, and only reading from atomics, so the
+// chances of this actually triggering UB (e.g. a security issue that can be triggered from the GPU
+// side) due to a compiler optimization are very slim.
+//
+// Further discussion: https://github.com/rust-lang/unsafe-code-guidelines/issues/152
+
+use kernel::{error::code::*, prelude::*};
+
+use alloc::boxed::Box;
+use core::fmt;
+use core::fmt::Debug;
+use core::fmt::Formatter;
+use core::marker::PhantomData;
+use core::mem::MaybeUninit;
+use core::num::NonZeroU64;
+use core::ops::{Deref, DerefMut, Index, IndexMut};
+use core::{mem, ptr, slice};
+
+use crate::alloc::Allocation;
+use crate::debug::*;
+use crate::fw::types::Zeroed;
+
+const DEBUG_CLASS: DebugFlags = DebugFlags::Object;
+
+/// A GPU-side strong pointer, which is a 64-bit non-zero VA with an associated lifetime.
+///
+/// In rare cases these pointers are not aligned, so this is `packed(1)`.
+#[repr(C, packed(1))]
+pub(crate) struct GpuPointer<'a, T: ?Sized>(NonZeroU64, PhantomData<&'a T>);
+
+impl<'a, T: ?Sized> GpuPointer<'a, T> {
+    /// Logical OR the pointer with an arbitrary `u64`. This is used when GPU struct fields contain
+    /// misc flag fields in the upper bits. The lifetime is retained. This is GPU-unsafe in
+    /// principle, but we assert that only non-implemented address bits are touched, which is safe
+    /// for pointers used by the GPU (not by firmware).
+    pub(crate) fn or(&self, other: u64) -> GpuPointer<'a, T> {
+        // This will fail for kernel-half pointers, which should not be ORed.
+        assert_eq!(self.0.get() & other, 0);
+        // Assert that we only touch the high bits.
+        assert_eq!(other & 0xffffffffff, 0);
+        GpuPointer(self.0 | other, PhantomData)
+    }
+
+    /// Add an arbitrary offset to the pointer. This is not safe (from the GPU perspective), and
+    /// should only be used via the `inner_ptr` macro to get pointers to inner fields, hence we mark
+    /// it `unsafe` to discourage direct use.
+    // NOTE: The third argument is a type inference hack.
+    pub(crate) unsafe fn offset<U>(&self, off: usize, _: *const U) -> GpuPointer<'a, U> {
+        GpuPointer::<'a, U>(
+            NonZeroU64::new(self.0.get() + (off as u64)).unwrap(),
+            PhantomData,
+        )
+    }
+}
+
+impl<'a, T: ?Sized> Debug for GpuPointer<'a, T> {
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        let val = self.0;
+        f.write_fmt(format_args!("{:#x} ({})", val, core::any::type_name::<T>()))
+    }
+}
+
+/// Take a pointer to a sub-field within a structure pointed to by a GpuPointer, keeping the
+/// lifetime.
+#[macro_export]
+macro_rules! inner_ptr {
+    ($gpuva:expr, $($f:tt)*) => ({
+        // This mirrors kernel::offset_of(), except we use type inference to avoid having to know
+        // the type of the pointer explicitly.
+        fn uninit_from<'a, T: GpuStruct>(_: GpuPointer<'a, T>) -> core::mem::MaybeUninit<T::Raw<'static>> {
+            core::mem::MaybeUninit::uninit()
+        }
+        let tmp = uninit_from($gpuva);
+        let outer = tmp.as_ptr();
+        // SAFETY: The pointer is valid and aligned, just not initialised; `addr_of` ensures that
+        // we don't actually read from `outer` (which would be UB) nor create an intermediate
+        // reference.
+        let p: *const _ = unsafe { core::ptr::addr_of!((*outer).$($f)*) };
+        let inner = p as *const u8;
+        // SAFETY: The two pointers are within the same allocation block.
+        let off = unsafe { inner.offset_from(outer as *const u8) };
+        // SAFETY: The resulting pointer is guaranteed to point to valid memory within the outer
+        // object.
+        unsafe { $gpuva.offset(off.try_into().unwrap(), p) }
+    })
+}
+
+/// A GPU-side weak pointer, which is a 64-bit non-zero VA with no lifetime.
+///
+/// In rare cases these pointers are not aligned, so this is `packed(1)`.
+#[repr(C, packed(1))]
+pub(crate) struct GpuWeakPointer<T: ?Sized>(NonZeroU64, PhantomData<*const T>);
+
+/// SAFETY: GPU weak pointers are always safe to share between threads.
+unsafe impl<T: ?Sized> Send for GpuWeakPointer<T> {}
+unsafe impl<T: ?Sized> Sync for GpuWeakPointer<T> {}
+
+// Weak pointers can be copied/cloned regardless of their target type.
+impl<T: ?Sized> Copy for GpuWeakPointer<T> {}
+
+impl<T: ?Sized> Clone for GpuWeakPointer<T> {
+    fn clone(&self) -> Self {
+        *self
+    }
+}
+
+impl<T: ?Sized> GpuWeakPointer<T> {
+    /// Add an arbitrary offset to the pointer. This is not safe (from the GPU perspective), and
+    /// should only be used via the `inner_ptr` macro to get pointers to inner fields, hence we mark
+    /// it `unsafe` to discourage direct use.
+    // NOTE: The third argument is a type inference hack.
+    pub(crate) unsafe fn offset<U>(&self, off: usize, _: *const U) -> GpuWeakPointer<U> {
+        GpuWeakPointer::<U>(
+            NonZeroU64::new(self.0.get() + (off as u64)).unwrap(),
+            PhantomData,
+        )
+    }
+
+    /// Upgrade a weak pointer into a strong pointer. This is not considered safe from the GPU
+    /// perspective.
+    pub(crate) unsafe fn upgrade<'a>(&self) -> GpuPointer<'a, T> {
+        GpuPointer(self.0, PhantomData)
+    }
+}
+
+impl<T: ?Sized> Debug for GpuWeakPointer<T> {
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        let val = self.0;
+        f.write_fmt(format_args!("{:#x} ({})", val, core::any::type_name::<T>()))
+    }
+}
+
+/// Take a pointer to a sub-field within a structure pointed to by a GpuWeakPointer.
+#[macro_export]
+macro_rules! inner_weak_ptr {
+    ($gpuva:expr, $($f:tt)*) => ({
+        // See inner_ptr()
+        fn uninit_from<T: GpuStruct>(_: GpuWeakPointer<T>) -> core::mem::MaybeUninit<T::Raw<'static>> {
+            core::mem::MaybeUninit::uninit()
+        }
+        let tmp = uninit_from($gpuva);
+        let outer = tmp.as_ptr();
+        // SAFETY: The pointer is valid and aligned, just not initialised; `addr_of` ensures that
+        // we don't actually read from `outer` (which would be UB) nor create an intermediate
+        // reference.
+        let p: *const _ = unsafe { core::ptr::addr_of!((*outer).$($f)*) };
+        let inner = p as *const u8;
+        // SAFETY: The two pointers are within the same allocation block.
+        let off = unsafe { inner.offset_from(outer as *const u8) };
+        // SAFETY: The resulting pointer is guaranteed to point to valid memory within the outer
+        // object.
+        unsafe { $gpuva.offset(off.try_into().unwrap(), p) }
+    })
+}
+
+/// Types that implement this trait represent a GPU structure from the CPU side.
+///
+/// The `Raw` type represents the actual raw structure definition on the GPU side.
+///
+/// Types implementing [`GpuStruct`] must have fields owning any objects (or strong references
+/// to them) that GPU pointers in the `Raw` structure point to. This mechanism is used to enforce
+/// lifetimes.
+pub(crate) trait GpuStruct: 'static {
+    /// The type of the GPU-side structure definition representing the firmware struct layout.
+    type Raw<'a>;
+}
+
+/// An instance of a GPU object in memory.
+///
+/// # Invariants
+/// `raw` must point to a valid mapping of the `T::Raw` type associated with the `alloc` allocation.
+/// `gpu_ptr` must be the GPU address of the same object.
+pub(crate) struct GpuObject<T: GpuStruct, U: Allocation<T>> {
+    raw: *mut T::Raw<'static>,
+    alloc: U,
+    gpu_ptr: GpuWeakPointer<T>,
+    inner: Box<T>,
+}
+
+impl<T: GpuStruct, U: Allocation<T>> GpuObject<T, U> {
+    /// Create a new GpuObject given an allocator and the inner data (a type implementing
+    /// GpuStruct).
+    ///
+    /// The caller passes a closure that constructs the `T::Raw` type given a reference to the
+    /// `GpuStruct`. This is the mechanism used to enforce lifetimes.
+    pub(crate) fn new(
+        alloc: U,
+        inner: T,
+        callback: impl for<'a> FnOnce(&'a T) -> T::Raw<'a>,
+    ) -> Result<Self> {
+        let size = mem::size_of::<T::Raw<'static>>();
+        if size > 0x1000 {
+            dev_crit!(
+                alloc.device(),
+                "Allocating {} of size {:#x}, with new, please use new_boxed!\n",
+                core::any::type_name::<T>(),
+                size
+            );
+        }
+        if alloc.size() < size {
+            return Err(ENOMEM);
+        }
+        let gpu_ptr =
+            GpuWeakPointer::<T>(NonZeroU64::new(alloc.gpu_ptr()).ok_or(EINVAL)?, PhantomData);
+        mod_dev_dbg!(
+            alloc.device(),
+            "Allocating {} @ {:#x}\n",
+            core::any::type_name::<T>(),
+            alloc.gpu_ptr()
+        );
+        let p = alloc.ptr().ok_or(EINVAL)?.as_ptr() as *mut T::Raw<'static>;
+        let mut raw = callback(&inner);
+        // SAFETY: `p` is guaranteed to be valid per the Allocation invariant, and the type is
+        // identical to the type of `raw` other than the lifetime.
+        unsafe { p.copy_from(&mut raw as *mut _ as *mut u8 as *mut _, 1) };
+        mem::forget(raw);
+        Ok(Self {
+            raw: p,
+            gpu_ptr,
+            alloc,
+            inner: Box::try_new(inner)?,
+        })
+    }
+
+    /// Create a new GpuObject given an allocator and the boxed inner data (a type implementing
+    /// GpuStruct).
+    ///
+    /// The caller passes a closure that initializes the `T::Raw` type given a reference to the
+    /// `GpuStruct` and a `MaybeUninit<T::Raw>`. This is intended to be used with the place!()
+    /// macro to avoid constructing the whole `T::Raw` object on the stack.
+    pub(crate) fn new_boxed(
+        alloc: U,
+        inner: Box<T>,
+        callback: impl for<'a> FnOnce(
+            &'a T,
+            &'a mut MaybeUninit<T::Raw<'a>>,
+        ) -> Result<&'a mut T::Raw<'a>>,
+    ) -> Result<Self> {
+        if alloc.size() < mem::size_of::<T::Raw<'static>>() {
+            return Err(ENOMEM);
+        }
+        let gpu_ptr =
+            GpuWeakPointer::<T>(NonZeroU64::new(alloc.gpu_ptr()).ok_or(EINVAL)?, PhantomData);
+        mod_dev_dbg!(
+            alloc.device(),
+            "Allocating {} @ {:#x}\n",
+            core::any::type_name::<T>(),
+            alloc.gpu_ptr()
+        );
+        let p = alloc.ptr().ok_or(EINVAL)?.as_ptr() as *mut MaybeUninit<T::Raw<'_>>;
+        // SAFETY: `p` is guaranteed to be valid per the Allocation invariant.
+        let raw = callback(&inner, unsafe { &mut *p })?;
+        if p as *mut T::Raw<'_> != raw as *mut _ {
+            dev_err!(
+                alloc.device(),
+                "Allocation callback returned a mismatched reference ({})\n",
+                core::any::type_name::<T>(),
+            );
+            return Err(EINVAL);
+        }
+        Ok(Self {
+            raw: p as *mut u8 as *mut T::Raw<'static>,
+            gpu_ptr,
+            alloc,
+            inner,
+        })
+    }
+
+    /// Create a new GpuObject given an allocator and the inner data (a type implementing
+    /// GpuStruct).
+    ///
+    /// The caller passes a closure that initializes the `T::Raw` type given a reference to the
+    /// `GpuStruct` and a `MaybeUninit<T::Raw>`. This is intended to be used with the place!()
+    /// macro to avoid constructing the whole `T::Raw` object on the stack.
+    pub(crate) fn new_inplace(
+        alloc: U,
+        inner: T,
+        callback: impl for<'a> FnOnce(
+            &'a T,
+            &'a mut MaybeUninit<T::Raw<'a>>,
+        ) -> Result<&'a mut T::Raw<'a>>,
+    ) -> Result<Self> {
+        GpuObject::<T, U>::new_boxed(alloc, Box::try_new(inner)?, callback)
+    }
+
+    /// Create a new GpuObject given an allocator, with callback-based initialization.
+    ///
+    /// This is used when the construction of the `T` type requires knowing the GPU VA address of
+    /// the structure that is being constructed ahead of time. The first callback constructs a
+    /// `Box<T>` given the pointer to the about-to-be-initialized GPU structure, and the second
+    /// callback initializes that structure as in `new_boxed`.
+    pub(crate) fn new_prealloc(
+        alloc: U,
+        inner_cb: impl FnOnce(GpuWeakPointer<T>) -> Result<Box<T>>,
+        raw_cb: impl for<'a> FnOnce(
+            &'a T,
+            &'a mut MaybeUninit<T::Raw<'a>>,
+        ) -> Result<&'a mut T::Raw<'a>>,
+    ) -> Result<Self> {
+        if alloc.size() < mem::size_of::<T::Raw<'static>>() {
+            return Err(ENOMEM);
+        }
+        let gpu_ptr =
+            GpuWeakPointer::<T>(NonZeroU64::new(alloc.gpu_ptr()).ok_or(EINVAL)?, PhantomData);
+        mod_dev_dbg!(
+            alloc.device(),
+            "Allocating {} @ {:#x}\n",
+            core::any::type_name::<T>(),
+            alloc.gpu_ptr()
+        );
+        let inner = inner_cb(gpu_ptr)?;
+        let p = alloc.ptr().ok_or(EINVAL)?.as_ptr() as *mut MaybeUninit<T::Raw<'_>>;
+        // SAFETY: `p` is guaranteed to be valid per the Allocation invariant.
+        let raw = raw_cb(&*inner, unsafe { &mut *p })?;
+        if p as *mut T::Raw<'_> != raw as *mut _ {
+            dev_err!(
+                alloc.device(),
+                "Allocation callback returned a mismatched reference ({})\n",
+                core::any::type_name::<T>(),
+            );
+            return Err(EINVAL);
+        }
+        Ok(Self {
+            raw: p as *mut u8 as *mut T::Raw<'static>,
+            gpu_ptr,
+            alloc,
+            inner,
+        })
+    }
+
+    /// Returns the GPU VA of this object (as a raw [`NonZeroU64`])
+    pub(crate) fn gpu_va(&self) -> NonZeroU64 {
+        self.gpu_ptr.0
+    }
+
+    /// Returns a strong GPU pointer to this object, with a lifetime.
+    pub(crate) fn gpu_pointer(&self) -> GpuPointer<'_, T> {
+        GpuPointer(self.gpu_ptr.0, PhantomData)
+    }
+
+    /// Returns a weak GPU pointer to this object, with no lifetime.
+    pub(crate) fn weak_pointer(&self) -> GpuWeakPointer<T> {
+        GpuWeakPointer(self.gpu_ptr.0, PhantomData)
+    }
+
+    /// Perform a mutation to the inner `Raw` data given a user-supplied callback.
+    ///
+    /// The callback gets a mutable reference to the `GpuStruct` type.
+    pub(crate) fn with_mut<RetVal>(
+        &mut self,
+        callback: impl for<'a> FnOnce(&'a mut <T as GpuStruct>::Raw<'a>, &'a mut T) -> RetVal,
+    ) -> RetVal {
+        // SAFETY: `self.raw` is valid per the type invariant, and the second half is just
+        // converting lifetimes.
+        unsafe { callback(&mut *self.raw, &mut *(&mut *self.inner as *mut _)) }
+    }
+
+    /// Access the inner `Raw` data given a user-supplied callback.
+    ///
+    /// The callback gets a reference to the `GpuStruct` type.
+    pub(crate) fn with<RetVal>(
+        &self,
+        callback: impl for<'a> FnOnce(&'a <T as GpuStruct>::Raw<'a>, &'a T) -> RetVal,
+    ) -> RetVal {
+        // SAFETY: `self.raw` is valid per the type invariant, and the second half is just
+        // converting lifetimes.
+        unsafe { callback(&*self.raw, &*(&*self.inner as *const _)) }
+    }
+}
+
+impl<T: GpuStruct, U: Allocation<T>> Deref for GpuObject<T, U> {
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
+        &self.inner
+    }
+}
+
+impl<T: GpuStruct, U: Allocation<T>> DerefMut for GpuObject<T, U> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.inner
+    }
+}
+
+impl<T: GpuStruct + Debug, U: Allocation<T>> Debug for GpuObject<T, U>
+where
+    <T as GpuStruct>::Raw<'static>: Debug,
+{
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        f.debug_struct(core::any::type_name::<T>())
+            // SAFETY: `self.raw` is valid per the type invariant.
+            .field("raw", &format_args!("{:#X?}", unsafe { &*self.raw }))
+            .field("inner", &format_args!("{:#X?}", &self.inner))
+            .field("alloc", &format_args!("{:?}", &self.alloc))
+            .finish()
+    }
+}
+
+impl<T: GpuStruct + Default, U: Allocation<T>> GpuObject<T, U>
+where
+    for<'a> <T as GpuStruct>::Raw<'a>: Default + Zeroed,
+{
+    /// Create a new GpuObject with default data. `T` must implement `Default` and `T::Raw` must
+    /// implement `Zeroed`, since the GPU-side memory is initialized by zeroing.
+    pub(crate) fn new_default(alloc: U) -> Result<Self> {
+        GpuObject::<T, U>::new_inplace(alloc, Default::default(), |_inner, raw| {
+            // SAFETY: `raw` is valid here, and `T::Raw` implements `Zeroed`.
+            Ok(unsafe {
+                ptr::write_bytes(raw, 0, 1);
+                (*raw).assume_init_mut()
+            })
+        })
+    }
+}
+
+impl<T: GpuStruct, U: Allocation<T>> Drop for GpuObject<T, U> {
+    fn drop(&mut self) {
+        mod_dev_dbg!(
+            self.alloc.device(),
+            "Dropping {} @ {:?}\n",
+            core::any::type_name::<T>(),
+            self.gpu_pointer()
+        );
+    }
+}
+
+// SAFETY: GpuObjects are Send as long as the GpuStruct itself is Send
+unsafe impl<T: GpuStruct + Send, U: Allocation<T>> Send for GpuObject<T, U> {}
+// SAFETY: GpuObjects are Send as long as the GpuStruct itself is Send
+unsafe impl<T: GpuStruct + Sync, U: Allocation<T>> Sync for GpuObject<T, U> {}
+
+/// Trait used to erase the type of a GpuObject, used when we need to keep a list of heterogenous
+/// objects around.
+pub(crate) trait OpaqueGpuObject: Send + Sync {
+    fn gpu_va(&self) -> NonZeroU64;
+}
+
+impl<T: GpuStruct + Sync + Send, U: Allocation<T>> OpaqueGpuObject for GpuObject<T, U> {
+    fn gpu_va(&self) -> NonZeroU64 {
+        Self::gpu_va(self)
+    }
+}
+
+/// An array of raw GPU objects that is only accessible to the GPU (no CPU-side mapping required).
+///
+/// This must necessarily be uninitialized as far as the GPU is concerned, so it cannot be used
+/// when initialization is required.
+///
+/// # Invariants
+///
+/// `alloc` is valid and at least as large as `len` times the size of one `T`.
+/// `gpu_ptr` is valid and points to the allocation start.
+pub(crate) struct GpuOnlyArray<T, U: Allocation<T>> {
+    len: usize,
+    alloc: U,
+    gpu_ptr: NonZeroU64,
+    _p: PhantomData<T>,
+}
+
+impl<T, U: Allocation<T>> GpuOnlyArray<T, U> {
+    /// Allocate a new GPU-only array with the given length.
+    pub(crate) fn new(alloc: U, count: usize) -> Result<GpuOnlyArray<T, U>> {
+        let bytes = count * mem::size_of::<T>();
+        let gpu_ptr = NonZeroU64::new(alloc.gpu_ptr()).ok_or(EINVAL)?;
+        if alloc.size() < bytes {
+            return Err(ENOMEM);
+        }
+        Ok(Self {
+            len: count,
+            alloc,
+            gpu_ptr,
+            _p: PhantomData,
+        })
+    }
+
+    /// Returns the GPU VA of this arraw (as a raw [`NonZeroU64`])
+    pub(crate) fn gpu_va(&self) -> NonZeroU64 {
+        self.gpu_ptr
+    }
+
+    /// Returns a strong GPU pointer to this array, with a lifetime.
+    pub(crate) fn gpu_pointer(&self) -> GpuPointer<'_, &'_ [T]> {
+        GpuPointer(self.gpu_ptr, PhantomData)
+    }
+
+    /// Returns a weak GPU pointer to this array, with no lifetime.
+    pub(crate) fn weak_pointer(&self) -> GpuWeakPointer<[T]> {
+        GpuWeakPointer(self.gpu_ptr, PhantomData)
+    }
+
+    /// Returns a pointer to an offset within the array (as a subslice).
+    pub(crate) fn gpu_offset_pointer(&self, offset: usize) -> GpuPointer<'_, &'_ [T]> {
+        if offset > self.len {
+            panic!("Index {} out of bounds (len: {})", offset, self.len);
+        }
+        GpuPointer(
+            NonZeroU64::new(self.gpu_ptr.get() + (offset * mem::size_of::<T>()) as u64).unwrap(),
+            PhantomData,
+        )
+    }
+
+    /* Not used yet
+    /// Returns a weak pointer to an offset within the array (as a subslice).
+    pub(crate) fn weak_offset_pointer(&self, offset: usize) -> GpuWeakPointer<[T]> {
+        if offset > self.len {
+            panic!("Index {} out of bounds (len: {})", offset, self.len);
+        }
+        GpuWeakPointer(
+            NonZeroU64::new(self.gpu_ptr.get() + (offset * mem::size_of::<T>()) as u64).unwrap(),
+            PhantomData,
+        )
+    }
+
+    /// Returns a pointer to an element within the array.
+    pub(crate) fn gpu_item_pointer(&self, index: usize) -> GpuPointer<'_, &'_ T> {
+        if index >= self.len {
+            panic!("Index {} out of bounds (len: {})", index, self.len);
+        }
+        GpuPointer(
+            NonZeroU64::new(self.gpu_ptr.get() + (index * mem::size_of::<T>()) as u64).unwrap(),
+            PhantomData,
+        )
+    }
+    */
+
+    /// Returns a weak pointer to an element within the array.
+    pub(crate) fn weak_item_pointer(&self, index: usize) -> GpuWeakPointer<T> {
+        if index >= self.len {
+            panic!("Index {} out of bounds (len: {})", index, self.len);
+        }
+        GpuWeakPointer(
+            NonZeroU64::new(self.gpu_ptr.get() + (index * mem::size_of::<T>()) as u64).unwrap(),
+            PhantomData,
+        )
+    }
+
+    /// Returns the length of the array.
+    pub(crate) fn len(&self) -> usize {
+        self.len
+    }
+}
+
+impl<T: Debug, U: Allocation<T>> Debug for GpuOnlyArray<T, U> {
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        f.debug_struct(core::any::type_name::<T>())
+            .field("len", &format_args!("{:#X?}", self.len()))
+            .finish()
+    }
+}
+
+impl<T, U: Allocation<T>> Drop for GpuOnlyArray<T, U> {
+    fn drop(&mut self) {
+        mod_dev_dbg!(
+            self.alloc.device(),
+            "Dropping {} @ {:?}\n",
+            core::any::type_name::<T>(),
+            self.gpu_pointer()
+        );
+    }
+}
+
+/// An array of raw GPU objects that is also CPU-accessible.
+///
+/// # Invariants
+///
+/// `raw` is valid and points to the CPU-side view of the array (which must have one).
+pub(crate) struct GpuArray<T, U: Allocation<T>> {
+    raw: *mut T,
+    array: GpuOnlyArray<T, U>,
+}
+
+/* Not used yet
+impl<T: Copy, U: Allocation<T>> GpuArray<T, U> {
+    /// Allocate a new GPU array, copying the contents from a slice.
+    pub(crate) fn new(alloc: U, data: &[T]) -> Result<GpuArray<T, U>> {
+        let p = alloc.ptr().ok_or(EINVAL)?.as_ptr();
+        let inner = GpuOnlyArray::new(alloc, data.len())?;
+        // SAFETY: `p` is valid per the Allocation type invariant, and GpuOnlyArray guarantees
+        // that its size is at least as large as `data.len()`.
+        unsafe { ptr::copy(data.as_ptr(), p, data.len()) };
+        Ok(Self {
+            raw: p,
+            array: inner,
+        })
+    }
+}
+*/
+
+impl<T: Default, U: Allocation<T>> GpuArray<T, U> {
+    /// Allocate a new GPU array, initializing each element to its default.
+    pub(crate) fn empty(alloc: U, count: usize) -> Result<GpuArray<T, U>> {
+        let p = alloc.ptr().ok_or(EINVAL)?.as_ptr() as *mut T;
+        let inner = GpuOnlyArray::new(alloc, count)?;
+        let mut pi = p;
+        for _i in 0..count {
+            // SAFETY: `pi` is valid per the Allocation type invariant, and GpuOnlyArray guarantees
+            // that it can never iterate beyond the buffer length.
+            unsafe {
+                pi.write(Default::default());
+                pi = pi.add(1);
+            }
+        }
+        Ok(Self {
+            raw: p,
+            array: inner,
+        })
+    }
+}
+
+impl<T, U: Allocation<T>> GpuArray<T, U> {
+    /// Get a slice view of the array contents.
+    pub(crate) fn as_slice(&self) -> &[T] {
+        // SAFETY: self.raw / self.len are valid per the type invariant
+        unsafe { slice::from_raw_parts(self.raw, self.len) }
+    }
+
+    /// Get a mutable slice view of the array contents.
+    pub(crate) fn as_mut_slice(&mut self) -> &mut [T] {
+        // SAFETY: self.raw / self.len are valid per the type invariant
+        unsafe { slice::from_raw_parts_mut(self.raw, self.len) }
+    }
+}
+
+impl<T, U: Allocation<T>> Deref for GpuArray<T, U> {
+    type Target = GpuOnlyArray<T, U>;
+
+    fn deref(&self) -> &GpuOnlyArray<T, U> {
+        &self.array
+    }
+}
+
+impl<T, U: Allocation<T>> Index<usize> for GpuArray<T, U> {
+    type Output = T;
+
+    fn index(&self, index: usize) -> &T {
+        if index >= self.len {
+            panic!("Index {} out of bounds (len: {})", index, self.len);
+        }
+        // SAFETY: This is bounds checked above
+        unsafe { &*(self.raw.add(index)) }
+    }
+}
+
+impl<T, U: Allocation<T>> IndexMut<usize> for GpuArray<T, U> {
+    fn index_mut(&mut self, index: usize) -> &mut T {
+        if index >= self.len {
+            panic!("Index {} out of bounds (len: {})", index, self.len);
+        }
+        // SAFETY: This is bounds checked above
+        unsafe { &mut *(self.raw.add(index)) }
+    }
+}
+
+// SAFETY: GpuArray are Send as long as the contained type itself is Send
+unsafe impl<T: Send, U: Allocation<T>> Send for GpuArray<T, U> {}
+// SAFETY: GpuArray are Sync as long as the contained type itself is Sync
+unsafe impl<T: Sync, U: Allocation<T>> Sync for GpuArray<T, U> {}
+
+impl<T: Debug, U: Allocation<T>> Debug for GpuArray<T, U> {
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        f.debug_struct(core::any::type_name::<T>())
+            .field("array", &format_args!("{:#X?}", self.as_slice()))
+            .finish()
+    }
+}
diff --git a/drivers/gpu/drm/asahi/place.rs b/drivers/gpu/drm/asahi/place.rs
new file mode 100644
index 000000000000..40c51f4fab8d
--- /dev/null
+++ b/drivers/gpu/drm/asahi/place.rs
@@ -0,0 +1,343 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+//! "Placement new" macro
+//!
+//! This cursed abomination of a declarative macro is used to emulate a "placement new" feature,
+//! which allows initializing objects directly in a user-provided memory region without first
+//! going through the stack.
+//!
+//! This driver needs to manage several large GPU objects of a fixed layout. Linux kernel stacks are
+//! very small, so it is impossible to create these objects on the stack. While the compiler can
+//! sometimes optimize away the stack copy and directly instantiate in target memory, this is not
+//! guaranteed and not reliable. Therefore, we need some mechanism to ergonomically initialize
+//! complex structures directly in a pre-allocated piece of memory.
+//!
+//! This issue also affects some driver-internal structs which are large/complex enough to overflow
+//! the stack. While this can be solved by breaking them up into pieces and using `Box` more
+//! liberally, this has performance implications and still isn't very nice. This macro can also be
+//! used to solve this issue.
+//!
+//! # Further reading
+//! https://github.com/rust-lang/rust/issues/27779#issuecomment-378416911
+//! https://internals.rust-lang.org/t/removal-of-all-unstable-placement-features/7223
+
+/// Initialize a `MaybeUninit` in-place, without constructing the value on the stack first.
+///
+/// This macro is analogous to `MaybeUninit::write()`. In other words,
+/// `place!(foo, bar)` is equivalent to `MaybeUninit::write(foo, bar)`, except that `bar` is not
+/// constructed first, but rather its fields (if it is a structure constructor) are copied one by
+/// one into the correct location in the `MaybeUninit`.
+///
+/// The macro supports most Rust initialization syntax including type paths, generic arguments,
+/// and nested structures. Nested structures are themselves initialized in-place field by field.
+/// `..Default::default()` is supported, but this macro converts it to `..Zeroed::zeroed()`, as it
+/// initializes those structs by zero-initializing the underlying memory. Usage of
+/// `..Default::default()` with a type not implementing `Zeroed` will result in a compile error.
+///
+/// Usage:
+/// ```
+/// let mut buf = MaybeUninit::uninit();
+/// let mut_ref = place!(&mut buf, MyStruct {
+///     b: true,
+///     s: String::from("works"),
+///     i: str::parse::<i32>("123").unwrap(),
+///     v: vec![String::from("works")],
+///     x: foo::MyOtherCoolStruct {
+///         a: false,
+///         b: String::from("Hello, world!"),
+///     },
+///     y: foo::MyOtherCoolStruct {
+///         a: false,
+///         b: String::from("Hello, world!"),
+///     },
+///     z: foo::MyCoolGenericStruct::<bool, String> {
+///         a: false,
+///         b: String::from("Hello, world!"),
+///     },
+/// };
+/// // `mut_ref` is now a mutable reference to the `buf`, which is now safely initialized.
+/// ```
+///
+/// Based on https://crates.io/crates/place by DianaNites, with contributions by Joshua Barretto.
+#[macro_export]
+macro_rules! place {
+    // Top-level struct
+    (@STRUCT $ptr:ident, _TOP, $typ:path, {$($typ_init:tt)*} { $($fields:tt)* }) => {{
+        place!(@STRUCT_ZERO $ptr, {$($typ_init)*} { $($fields)* });
+        place!(@STRUCT_CHECK $ptr, {$($typ_init)*} { $($fields)* } {
+            place!(@FIELDS $ptr, $($fields)*);
+        });
+    }};
+    // Nested structure
+    (@STRUCT $ptr:ident, $f_struct:ident, $typ:path, {$($typ_init:tt)*} { $($fields:tt)* }) => {{
+        use core::ptr::addr_of_mut;
+        let buf = unsafe { addr_of_mut!((*$ptr).$f_struct) };
+        place!(@STRUCT_ZERO buf, {$($typ_init)*} { $($fields)* });
+        place!(@STRUCT_CHECK $ptr, {$($typ_init)*} { $($fields)* } {
+            place!(@FIELDS buf, $($fields)*);
+        });
+    }};
+
+    // Zero-initialize structure if the initializer ends in ..default::Default()
+    (@STRUCT_ZERO $ptr:ident, {$($typ_init:tt)*} { $($f:ident $(: $v:expr)?),* $(,)? }) => {};
+    (@STRUCT_ZERO $ptr:ident, {$($typ_init:tt)*} { $($($f:ident $(: $v:expr)?),*,)? ..Default::default() }) => {{
+        // Check that the structure actually implements Zeroed
+        const _: () = {
+            fn _check_default() {
+                let _ = $($typ_init)* {
+                    ..Zeroed::zeroed()
+                };
+            }
+        };
+        use core::ptr;
+        unsafe { ptr::write_bytes($ptr, 0, 1) };
+
+    }};
+
+    // Check that all fields are specified
+    (@STRUCT_CHECK $ptr:ident, {$($typ_init:tt)*} { $($($f:ident $(: $v:expr)?),*,)? ..Default::default() } {$($body:tt)*}) => {
+        if false {
+            #[allow(clippy::redundant_field_names)]
+            let _x = $($typ_init)* {
+                $($(
+                    $f $(: $v)?
+                ),*
+                ,)?
+                ..Zeroed::zeroed()
+            };
+        } else {
+            {$($body)*}
+        }
+    };
+    (@STRUCT_CHECK $ptr:ident, {$($typ_init:tt)*} { $($f:ident $(: $v:expr)?),* $(,)? } {$($body:tt)*}) => {
+        if false {
+            #[allow(clippy::redundant_field_names)]
+            let _x = $($typ_init)* {
+                $(
+                    $f $(: $v)?
+                ),*
+            };
+        } else {
+            {$($body)*}
+        }
+    };
+    // Top-level scalar
+    (@SCALAR $ptr:ident, _TOP, $val:expr) => {
+        let tmp = $val;
+        unsafe { $ptr.write(tmp); }
+    };
+    // Regular field
+    (@SCALAR $ptr:ident, $f:ident, $val:expr) => {{
+        use core::ptr::addr_of_mut;
+        let tmp = $val;
+        unsafe { addr_of_mut!((*$ptr).$f).write(tmp); }
+    }};
+    // Type-like name followed by braces is a nested structure
+    (@PARTIAL $ptr:ident, $f:ident, {$($head:tt)*}, {{ $($fields:tt)* } $($tail:tt)*}) => {
+        place!(@STRUCT $ptr, $f, $($head)*, {$($head)*} { $($fields)* });
+        place!(@FIELDS $ptr $($tail)*)
+    };
+    // Type-like name followed by ::ident, append to head
+    (@PARTIAL $ptr:ident, $f:ident, {$($head:tt)*}, {::$id:ident $($tail:tt)*}) => {
+        place!(@PARTIAL $ptr, $f, {$($head)* :: $id}, {$($tail)*});
+    };
+    // Type-like name followed by ::<args>, append to head
+    (@PARTIAL $ptr:ident, $f:ident, {$($head:tt)*}, {::<$($gen:ty),*> $($tail:tt)*}) => {
+        place!(@PARTIAL $ptr, $f, {$($head)* :: <$($gen),*>}, {$($tail)*});
+    };
+    // Type-like name followed by ::<'lifetime>, append to head
+    (@PARTIAL $ptr:ident, $f:ident, {$($head:tt)*}, {::<$li:lifetime> $($tail:tt)*}) => {
+        place!(@PARTIAL $ptr, $f, {$($head)* :: <$li>}, {$($tail)*});
+    };
+    // Anything else, parse it as an expression
+    (@PARTIAL $ptr:ident, $f:ident, {$($head:tt)*}, {$($tail:tt)*}) => {
+        place!(@EXPR $ptr, $f, $($head)* $($tail)*)
+    };
+    // Expression followed by more fields
+    (@EXPR $ptr:ident, $f:ident, $val:expr, $($tail:tt)*) => {
+        place!(@SCALAR $ptr, $f, $val);
+        place!(@FIELDS $ptr, $($tail)*)
+    };
+    // Last field expression, without a trailing comma
+    (@EXPR $ptr:ident, $f:ident, $val:expr) => {
+        place!(@SCALAR $ptr, $f, $val);
+    };
+    // Field with a value starting with an ident, start incremental type parsing
+    (@FIELDS $ptr:ident, $f:ident : $id:ident $($tail:tt)*) => {
+        place!(@PARTIAL $ptr, $f, {$id}, {$($tail)*});
+    };
+    // Same, but starting with ::ident
+    (@FIELDS $ptr:ident, $f:ident : ::$id:ident $($tail:tt)*) => {
+        place!(@PARTIAL $ptr, $f, {::$id}, {$($tail)*});
+    };
+    // Otherwise, parse it as an expression
+    (@FIELDS $ptr:ident, $f:ident : $($tail:tt)*) => {
+        place!(@EXPR $ptr, $f, $($tail)*)
+    };
+    // Default terminating case
+    (@FIELDS $ptr:ident, ..Default::default() ) => {};
+    // Terminating case
+    (@FIELDS $ptr:ident $(,)? ) => {};
+    (
+        $buf:expr,
+        $($val:tt)*
+    ) => {{
+        use core::mem::MaybeUninit;
+        // Ensures types are correct
+        let obj: &mut MaybeUninit<_> = $buf;
+        let top_ptr = obj.as_mut_ptr();
+        place!(@FIELDS top_ptr, _TOP: $($val)*);
+        // SAFETY: All fields have been initialized above
+        // The compiler ensures that all fields were used, all types were correct,
+        // and that size and alignment are correct.
+        unsafe { obj.assume_init_mut() }
+    }};
+}
+
+/// Helper macro to get the struct type part of a struct initialization expression.
+#[macro_export]
+#[doc(hidden)]
+macro_rules! get_type {
+    ($t:ty { $($val:tt)* }) => {
+        $t
+    };
+}
+
+/// Like `Box::try_new(...)`, but with in-place initialization.
+#[macro_export]
+macro_rules! box_in_place {
+    ($($val:tt)*) => {{
+        use $crate::place;
+        let b = Box::<$crate::get_type!($($val)*)>::try_new_uninit();
+        match b {
+            Ok(mut p) => {
+                place!((&mut *p), $($val)*);
+                Ok(unsafe { p.assume_init() })
+            }
+            Err(e) => Err(e)
+        }
+    }};
+}
+
+// TODO: figure out how to make this run
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use core::mem::MaybeUninit;
+
+    #[derive(Debug, PartialEq)]
+    struct MyCoolStruct {
+        b: bool,
+        s: String,
+        i: i32,
+        v: Vec<String>,
+        x: MyOtherCoolStruct,
+        y: MyOtherCoolStruct,
+        z: foo::MyCoolGenericStruct<bool, String>,
+    }
+
+    #[derive(Debug, PartialEq)]
+    struct MyDefaultStruct {
+        b: bool,
+        i: i32,
+        j: i16,
+    }
+    default_zeroed!(MyDefaultStruct);
+
+    mod foo {
+        #[derive(Debug, PartialEq)]
+        pub struct MyOtherCoolStruct {
+            pub a: bool,
+            pub b: String,
+        }
+        #[derive(Debug, PartialEq)]
+        pub struct MyCoolGenericStruct<T, U> {
+            pub a: T,
+            pub b: U,
+        }
+    }
+
+    use foo::MyOtherCoolStruct;
+
+    #[test]
+    fn test_initialized() {
+        let mut buf: MaybeUninit<MyCoolStruct> = MaybeUninit::uninit();
+
+        let x: &mut MyCoolStruct = place!(
+            &mut buf,
+            MyCoolStruct {
+                b: true,
+                s: String::from("works"),
+                i: str::parse::<i32>("123").unwrap(),
+                v: vec![String::from("works")],
+                x: MyOtherCoolStruct {
+                    a: false,
+                    b: String::from("Hello, world!"),
+                },
+                y: foo::MyOtherCoolStruct {
+                    a: false,
+                    b: String::from("Hello, world!"),
+                },
+                z: foo::MyCoolGenericStruct::<bool, String> {
+                    a: false,
+                    b: String::from("Hello, world!"),
+                }
+            }
+        );
+        //dbg!(x);
+
+        assert_eq!(
+            x,
+            &MyCoolStruct {
+                b: true,
+                s: String::from("works"),
+                i: str::parse::<i32>("123").unwrap(),
+                v: vec![String::from("works")],
+                x: foo::MyOtherCoolStruct {
+                    a: false,
+                    b: String::from("Hello, world!"),
+                },
+                y: foo::MyOtherCoolStruct {
+                    a: false,
+                    b: String::from("Hello, world!"),
+                },
+                z: foo::MyCoolGenericStruct::<bool, String> {
+                    a: false,
+                    b: String::from("Hello, world!"),
+                },
+            },
+        );
+    }
+
+    #[test]
+    fn test_default() {
+        let mut buf: MaybeUninit<MyDefaultStruct> = MaybeUninit::uninit();
+
+        let x: &mut MyDefaultStruct = place!(
+            &mut buf,
+            MyDefaultStruct {
+                b: true,
+                i: 1,
+                ..Default::default()
+            }
+        );
+
+        assert_eq!(
+            x,
+            &MyDefaultStruct {
+                b: true,
+                i: 1,
+                j: 0,
+            },
+        );
+    }
+
+    #[test]
+    fn test_scalar() {
+        let mut buf: MaybeUninit<u32> = MaybeUninit::uninit();
+
+        let x: &mut u32 = place!(&mut buf, 1234);
+
+        assert_eq!(x, &mut 1234u32);
+    }
+}
diff --git a/drivers/gpu/drm/asahi/queue/common.rs b/drivers/gpu/drm/asahi/queue/common.rs
new file mode 100644
index 000000000000..127b4ccc6eca
--- /dev/null
+++ b/drivers/gpu/drm/asahi/queue/common.rs
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+
+//! Common queue functionality.
+//!
+//! Shared helpers used by the submission logic for multiple command types.
+
+use crate::fw::microseq;
+use crate::fw::types::*;
+
+use kernel::bindings;
+use kernel::io_buffer::IoBufferReader;
+use kernel::prelude::*;
+use kernel::user_ptr::UserSlicePtr;
+
+use core::mem::MaybeUninit;
+
+pub(super) fn build_attachments(pointer: u64, count: u32) -> Result<microseq::Attachments> {
+    if count as usize > microseq::MAX_ATTACHMENTS {
+        return Err(EINVAL);
+    }
+
+    const STRIDE: usize = core::mem::size_of::<bindings::drm_asahi_attachment>();
+    let size = STRIDE * count as usize;
+
+    // SAFETY: We only read this once, so there are no TOCTOU issues.
+    let mut reader = unsafe { UserSlicePtr::new(pointer as usize as *mut _, size).reader() };
+
+    let mut attachments: microseq::Attachments = Default::default();
+
+    for i in 0..count {
+        let mut att: MaybeUninit<bindings::drm_asahi_attachment> = MaybeUninit::uninit();
+
+        // SAFETY: The size of `att` is STRIDE
+        unsafe { reader.read_raw(att.as_mut_ptr() as *mut u8, STRIDE)? };
+
+        // SAFETY: All bit patterns in the struct are valid
+        let att = unsafe { att.assume_init() };
+
+        let cache_lines = (att.size + 127) >> 7;
+        let order = 1;
+        attachments.list[i as usize] = microseq::Attachment {
+            address: U64(att.pointer),
+            size: cache_lines,
+            unk_c: 0x17,
+            unk_e: order,
+        };
+
+        attachments.count += 1;
+    }
+
+    Ok(attachments)
+}
diff --git a/drivers/gpu/drm/asahi/queue/compute.rs b/drivers/gpu/drm/asahi/queue/compute.rs
new file mode 100644
index 000000000000..6590382c75af
--- /dev/null
+++ b/drivers/gpu/drm/asahi/queue/compute.rs
@@ -0,0 +1,371 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+#![allow(clippy::unusual_byte_groupings)]
+
+//! Compute work queue.
+//!
+//! A compute queue consists of one underlying WorkQueue.
+//! This module is in charge of creating all of the firmware structures required to submit compute
+//! work to the GPU, based on the userspace command buffer.
+
+use super::common;
+use crate::alloc::Allocator;
+use crate::debug::*;
+use crate::fw::types::*;
+use crate::gpu::GpuManager;
+use crate::{box_in_place, inner_ptr, inner_weak_ptr, place};
+use crate::{fw, gpu, microseq};
+use core::mem::MaybeUninit;
+use core::sync::atomic::Ordering;
+use kernel::bindings;
+use kernel::dma_fence::RawDmaFence;
+use kernel::drm::sched::Job;
+use kernel::io_buffer::IoBufferReader;
+use kernel::prelude::*;
+use kernel::sync::Arc;
+use kernel::user_ptr::UserSlicePtr;
+
+const DEBUG_CLASS: DebugFlags = DebugFlags::Compute;
+
+#[versions(AGX)]
+impl super::Queue::ver {
+    /// Submit work to a compute queue.
+    pub(super) fn submit_compute(
+        &self,
+        job: &mut Job<super::QueueJob::ver>,
+        cmd: &bindings::drm_asahi_command,
+        result_writer: Option<super::ResultWriter>,
+        id: u64,
+        flush_stamps: bool,
+    ) -> Result {
+        if cmd.cmd_type != bindings::drm_asahi_cmd_type_DRM_ASAHI_CMD_COMPUTE {
+            return Err(EINVAL);
+        }
+
+        let dev = self.dev.data();
+        let gpu = match dev.gpu.as_any().downcast_ref::<gpu::GpuManager::ver>() {
+            Some(gpu) => gpu,
+            None => {
+                dev_crit!(self.dev, "GpuManager mismatched with Queue!\n");
+                return Err(EIO);
+            }
+        };
+
+        let mut alloc = gpu.alloc();
+        let kalloc = &mut *alloc;
+
+        mod_dev_dbg!(self.dev, "[Submission {}] Compute!\n", id);
+
+        let mut cmdbuf_reader = unsafe {
+            UserSlicePtr::new(
+                cmd.cmd_buffer as usize as *mut _,
+                core::mem::size_of::<bindings::drm_asahi_cmd_compute>(),
+            )
+            .reader()
+        };
+
+        let mut cmdbuf: MaybeUninit<bindings::drm_asahi_cmd_compute> = MaybeUninit::uninit();
+        unsafe {
+            cmdbuf_reader.read_raw(
+                cmdbuf.as_mut_ptr() as *mut u8,
+                core::mem::size_of::<bindings::drm_asahi_cmd_compute>(),
+            )?;
+        }
+        let cmdbuf = unsafe { cmdbuf.assume_init() };
+
+        if cmdbuf.flags != 0 {
+            return Err(EINVAL);
+        }
+
+        // This sequence number increases per new client/VM? assigned to some slot,
+        // but it's unclear *which* slot...
+        let slot_client_seq: u8 = (self.id & 0xff) as u8;
+
+        let vm_bind = job.vm_bind.clone();
+
+        mod_dev_dbg!(
+            self.dev,
+            "[Submission {}] VM slot = {}\n",
+            id,
+            vm_bind.slot()
+        );
+
+        let notifier = self.notifier.clone();
+
+        let fence = job.fence.clone();
+        let comp_job = job.get_comp()?;
+        let ev_comp = comp_job.event_info();
+
+        // TODO: Is this the same on all GPUs? Is this really for preemption?
+        let preempt_size = 0x7fa0;
+        let preempt2_off = 0x7f80;
+        let preempt3_off = 0x7f88;
+        let preempt4_off = 0x7f90;
+        let preempt5_off = 0x7f98;
+
+        let preempt_buf = self.ualloc.lock().array_empty(preempt_size)?;
+
+        let mut seq_buf = self.ualloc.lock().array_empty(0x800)?;
+        for i in 1..0x400 {
+            seq_buf[i] = (i + 1) as u64;
+        }
+
+        mod_dev_dbg!(
+            self.dev,
+            "[Submission {}] Event #{} {:#x?} -> {:#x?}\n",
+            id,
+            ev_comp.slot,
+            ev_comp.value,
+            ev_comp.value.next(),
+        );
+
+        let timestamps = Arc::try_new(kalloc.shared.new_default::<fw::job::JobTimestamps>()?)?;
+
+        let uuid = cmdbuf.cmd_id;
+
+        let unk3 = debug_enabled(debug::DebugFlags::Debug3);
+
+        mod_dev_dbg!(self.dev, "[Submission {}] UUID = {:#x?}\n", id, uuid);
+
+        // TODO: check
+        #[ver(V >= V13_0B4)]
+        let count = self.counter.fetch_add(1, Ordering::Relaxed);
+
+        let comp = GpuObject::new_prealloc(
+            kalloc.private.alloc_object()?,
+            |ptr: GpuWeakPointer<fw::compute::RunCompute::ver>| {
+                let mut builder = microseq::Builder::new();
+
+                let stats = gpu.initdata.runtime_pointers.stats.comp.weak_pointer();
+
+                let start_comp = builder.add(microseq::StartCompute::ver {
+                    header: microseq::op::StartCompute::HEADER,
+                    unk_pointer: inner_weak_ptr!(ptr, unk_pointee),
+                    job_params1: inner_weak_ptr!(ptr, job_params1),
+                    stats,
+                    work_queue: ev_comp.info_ptr,
+                    vm_slot: vm_bind.slot(),
+                    unk_28: 0x1,
+                    event_generation: self.id as u32,
+                    cmd_seq: U64(ev_comp.cmd_seq),
+                    unk_38: 0x0,
+                    job_params2: inner_weak_ptr!(ptr, job_params2),
+                    unk_44: 0x0,
+                    uuid,
+                    attachments: common::build_attachments(
+                        cmdbuf.attachments,
+                        cmdbuf.attachment_count,
+                    )?,
+                    padding: Default::default(),
+                    #[ver(V >= V13_0B4)]
+                    unk_flag: inner_weak_ptr!(ptr, unk_flag),
+                    #[ver(V >= V13_0B4)]
+                    counter: U64(count),
+                    #[ver(V >= V13_0B4)]
+                    notifier_buf: inner_weak_ptr!(notifier.weak_pointer(), state.unk_buf),
+                })?;
+
+                if result_writer.is_some() {
+                    builder.add(microseq::Timestamp::ver {
+                        header: microseq::op::Timestamp::new(true),
+                        cur_ts: inner_weak_ptr!(ptr, cur_ts),
+                        start_ts: inner_weak_ptr!(ptr, start_ts),
+                        update_ts: inner_weak_ptr!(ptr, start_ts),
+                        work_queue: ev_comp.info_ptr,
+                        unk_24: U64(0),
+                        #[ver(V >= V13_0B4)]
+                        unk_ts: inner_weak_ptr!(ptr, unk_ts),
+                        uuid,
+                        unk_30_padding: 0,
+                    })?;
+                }
+
+                builder.add(microseq::WaitForIdle {
+                    header: microseq::op::WaitForIdle::new(microseq::Pipe::Compute),
+                })?;
+
+                if result_writer.is_some() {
+                    builder.add(microseq::Timestamp::ver {
+                        header: microseq::op::Timestamp::new(false),
+                        cur_ts: inner_weak_ptr!(ptr, cur_ts),
+                        start_ts: inner_weak_ptr!(ptr, start_ts),
+                        update_ts: inner_weak_ptr!(ptr, end_ts),
+                        work_queue: ev_comp.info_ptr,
+                        unk_24: U64(0),
+                        #[ver(V >= V13_0B4)]
+                        unk_ts: inner_weak_ptr!(ptr, unk_ts),
+                        uuid,
+                        unk_30_padding: 0,
+                    })?;
+                }
+
+                let off = builder.offset_to(start_comp);
+                builder.add(microseq::FinalizeCompute::ver {
+                    header: microseq::op::FinalizeCompute::HEADER,
+                    stats,
+                    work_queue: ev_comp.info_ptr,
+                    vm_slot: vm_bind.slot(),
+                    #[ver(V < V13_0B4)]
+                    unk_18: 0,
+                    job_params2: inner_weak_ptr!(ptr, job_params2),
+                    unk_24: 0,
+                    uuid,
+                    fw_stamp: ev_comp.fw_stamp_pointer,
+                    stamp_value: ev_comp.value.next(),
+                    unk_38: 0,
+                    unk_3c: 0,
+                    unk_40: 0,
+                    unk_44: 0,
+                    unk_48: 0,
+                    unk_4c: 0,
+                    unk_50: 0,
+                    unk_54: 0,
+                    unk_58: 0,
+                    #[ver(G == G14 && V < V13_0B4)]
+                    unk_5c_g14: U64(0),
+                    restart_branch_offset: off,
+                    unk_60: unk3.into(),
+                    #[ver(V >= V13_0B4)]
+                    unk_64: Default::default(),
+                    #[ver(V >= V13_0B4)]
+                    unk_flag: inner_weak_ptr!(ptr, unk_flag),
+                    #[ver(V >= V13_0B4)]
+                    unk_79: Default::default(),
+                })?;
+
+                builder.add(microseq::RetireStamp {
+                    header: microseq::op::RetireStamp::HEADER,
+                })?;
+
+                Ok(box_in_place!(fw::compute::RunCompute::ver {
+                    notifier: notifier.clone(),
+                    preempt_buf: preempt_buf,
+                    seq_buf: seq_buf,
+                    micro_seq: builder.build(&mut kalloc.private)?,
+                    vm_bind: vm_bind.clone(),
+                    timestamps: timestamps.clone(),
+                })?)
+            },
+            |inner, ptr| {
+                Ok(place!(
+                    ptr,
+                    fw::compute::raw::RunCompute::ver {
+                        tag: fw::workqueue::CommandType::RunCompute,
+                        #[ver(V >= V13_0B4)]
+                        counter: U64(count),
+                        unk_4: 0,
+                        vm_slot: vm_bind.slot(),
+                        notifier: inner.notifier.gpu_pointer(),
+                        unk_pointee: Default::default(),
+                        job_params1: fw::compute::raw::JobParameters1 {
+                            preempt_buf1: inner.preempt_buf.gpu_pointer(),
+                            encoder: U64(cmdbuf.encoder_ptr),
+                            // buf2-5 Only if internal program is used
+                            preempt_buf2: inner.preempt_buf.gpu_offset_pointer(preempt2_off),
+                            preempt_buf3: inner.preempt_buf.gpu_offset_pointer(preempt3_off),
+                            preempt_buf4: inner.preempt_buf.gpu_offset_pointer(preempt4_off),
+                            preempt_buf5: inner.preempt_buf.gpu_offset_pointer(preempt5_off),
+                            pipeline_base: U64(0x11_00000000),
+                            unk_38: U64(0x8c60),
+                            unk_40: cmdbuf.ctx_switch_prog, // Internal program addr | 1
+                            unk_44: 0,
+                            compute_layout_addr: U64(cmdbuf.buffer_descriptor), // Only if internal program used
+                            unk_50: cmdbuf.buffer_descriptor_size, // 0x40 if internal program used
+                            unk_54: 0,
+                            unk_58: 1,
+                            unk_5c: 0,
+                            iogpu_unk_40: cmdbuf.iogpu_unk_40, // 0x1c if internal program used
+                        },
+                        unk_b8: Default::default(),
+                        microsequence: inner.micro_seq.gpu_pointer(),
+                        microsequence_size: inner.micro_seq.len() as u32,
+                        job_params2: fw::compute::raw::JobParameters2::ver {
+                            #[ver(V >= V13_0B4)]
+                            unk_0_0: 0,
+                            unk_0: Default::default(),
+                            preempt_buf1: inner.preempt_buf.gpu_pointer(),
+                            encoder_end: U64(cmdbuf.encoder_end),
+                            unk_34: Default::default(),
+                            #[ver(V < V13_0B4)]
+                            unk_5c: 0,
+                        },
+                        encoder_params: fw::job::raw::EncoderParams {
+                            unk_8: 0x0,  // fixed
+                            unk_c: 0x0,  // fixed
+                            unk_10: 0x0, // fixed
+                            encoder_id: cmdbuf.encoder_id,
+                            unk_18: 0x0, // fixed
+                            iogpu_compute_unk44: cmdbuf.iogpu_unk_44,
+                            seq_buffer: inner.seq_buf.gpu_pointer(),
+                            unk_28: U64(0x0), // fixed
+                        },
+                        meta: fw::job::raw::JobMeta {
+                            unk_4: 0,
+                            stamp: ev_comp.stamp_pointer,
+                            fw_stamp: ev_comp.fw_stamp_pointer,
+                            stamp_value: ev_comp.value.next(),
+                            stamp_slot: ev_comp.slot,
+                            evctl_index: 0, // fixed
+                            flush_stamps: flush_stamps as u32,
+                            uuid: uuid,
+                            cmd_seq: ev_comp.cmd_seq as u32,
+                        },
+                        cur_ts: U64(0),
+                        start_ts: Some(inner_ptr!(inner.timestamps.gpu_pointer(), start)),
+                        end_ts: Some(inner_ptr!(inner.timestamps.gpu_pointer(), end)),
+                        unk_2c0: 0,
+                        unk_2c4: 0,
+                        unk_2c8: 0,
+                        unk_2cc: 0,
+                        client_sequence: slot_client_seq,
+                        pad_2d1: Default::default(),
+                        unk_2d4: 0,
+                        unk_2d8: 0,
+                        #[ver(V >= V13_0B4)]
+                        unk_ts: U64(0),
+                        #[ver(V >= V13_0B4)]
+                        unk_2e1: Default::default(),
+                        #[ver(V >= V13_0B4)]
+                        unk_flag: U32(0),
+                        #[ver(V >= V13_0B4)]
+                        unk_pad: Default::default(),
+                    }
+                ))
+            },
+        )?;
+
+        core::mem::drop(alloc);
+
+        fence.add_command();
+        comp_job.add_cb(comp, vm_bind.slot(), move |cmd, error| {
+            if let Some(err) = error {
+                fence.set_error(err.into())
+            }
+            if let Some(mut rw) = result_writer {
+                let mut result: bindings::drm_asahi_result_compute = Default::default();
+
+                cmd.timestamps.with(|raw, _inner| {
+                    result.ts_start = raw.start.load(Ordering::Relaxed);
+                    result.ts_end = raw.end.load(Ordering::Relaxed);
+                });
+
+                if let Some(err) = error {
+                    result.info = err.into();
+                } else {
+                    result.info.status = bindings::drm_asahi_status_DRM_ASAHI_STATUS_COMPLETE;
+                }
+
+                rw.write(result);
+            }
+
+            fence.command_complete();
+        })?;
+
+        notifier.threshold.with(|raw, _inner| {
+            raw.increment();
+        });
+
+        comp_job.next_seq();
+
+        Ok(())
+    }
+}
diff --git a/drivers/gpu/drm/asahi/queue/mod.rs b/drivers/gpu/drm/asahi/queue/mod.rs
new file mode 100644
index 000000000000..15988af33cf3
--- /dev/null
+++ b/drivers/gpu/drm/asahi/queue/mod.rs
@@ -0,0 +1,725 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+
+//! Submission queue management
+//!
+//! This module implements the userspace view of submission queues and the logic to map userspace
+//! submissions to firmware queues.
+
+use kernel::dma_fence::*;
+use kernel::prelude::*;
+use kernel::{
+    bindings, c_str, dma_fence,
+    drm::gem::shmem::VMap,
+    drm::sched,
+    macros::versions,
+    sync::{smutex::Mutex, Arc},
+};
+
+use crate::alloc::Allocator;
+use crate::debug::*;
+use crate::driver::AsahiDevice;
+use crate::fw::types::*;
+use crate::gpu::GpuManager;
+use crate::{alloc, buffer, channel, event, file, fw, gem, gpu, mmu, workqueue};
+use crate::{inner_weak_ptr, place};
+
+use core::mem::MaybeUninit;
+use core::sync::atomic::{AtomicU64, Ordering};
+
+const DEBUG_CLASS: DebugFlags = DebugFlags::Queue;
+
+const WQ_SIZE: u32 = 0x500;
+
+mod common;
+mod compute;
+mod render;
+
+/// Trait implemented by all versioned queues.
+pub(crate) trait Queue: Send + Sync {
+    fn submit(
+        &mut self,
+        id: u64,
+        in_syncs: Vec<file::SyncItem>,
+        out_syncs: Vec<file::SyncItem>,
+        result_buf: Option<gem::ObjectRef>,
+        commands: Vec<bindings::drm_asahi_command>,
+    ) -> Result;
+}
+
+#[versions(AGX)]
+struct SubQueue {
+    wq: Arc<workqueue::WorkQueue::ver>,
+}
+
+#[versions(AGX)]
+impl SubQueue::ver {
+    fn new_job(&mut self) -> SubQueueJob::ver {
+        SubQueueJob::ver {
+            wq: self.wq.clone(),
+            job: None,
+        }
+    }
+}
+
+#[versions(AGX)]
+struct SubQueueJob {
+    wq: Arc<workqueue::WorkQueue::ver>,
+    job: Option<workqueue::Job::ver>,
+}
+
+#[versions(AGX)]
+impl SubQueueJob::ver {
+    fn get(&mut self) -> Result<&mut workqueue::Job::ver> {
+        if self.job.is_none() {
+            mod_pr_debug!("SubQueueJob: Creating {:?} job\n", self.wq.pipe_type());
+            self.job.replace(self.wq.new_job()?);
+        }
+        Ok(self.job.as_mut().expect("expected a Job"))
+    }
+
+    fn commit(&mut self) -> Result {
+        match self.job.as_mut() {
+            Some(job) => job.commit(),
+            None => Ok(()),
+        }
+    }
+
+    fn can_submit(&self) -> bool {
+        match self.job.as_ref() {
+            None => true,
+            Some(job) => job.can_submit(),
+        }
+    }
+}
+
+#[versions(AGX)]
+pub(crate) struct Queue {
+    dev: AsahiDevice,
+    _sched: sched::Scheduler<QueueJob::ver>,
+    entity: sched::Entity<QueueJob::ver>,
+    vm: mmu::Vm,
+    ualloc: Arc<Mutex<alloc::DefaultAllocator>>,
+    q_vtx: Option<SubQueue::ver>,
+    q_frag: Option<SubQueue::ver>,
+    q_comp: Option<SubQueue::ver>,
+    buffer: Option<Mutex<buffer::Buffer::ver>>,
+    gpu_context: Arc<workqueue::GpuContext>,
+    notifier_list: Arc<GpuObject<fw::event::NotifierList>>,
+    notifier: Arc<GpuObject<fw::event::Notifier::ver>>,
+    id: u64,
+    fence_ctx: FenceContexts,
+    #[ver(V >= V13_0B4)]
+    counter: AtomicU64,
+}
+
+#[versions(AGX)]
+#[derive(Default)]
+pub(crate) struct JobFence {
+    id: u64,
+    pending: AtomicU64,
+}
+
+#[versions(AGX)]
+impl JobFence::ver {
+    fn add_command(self: &FenceObject<Self>) {
+        self.pending.fetch_add(1, Ordering::Relaxed);
+    }
+
+    fn command_complete(self: &FenceObject<Self>) {
+        let remain = self.pending.fetch_sub(1, Ordering::Relaxed) - 1;
+        mod_pr_debug!(
+            "JobFence[{}]: Command complete (remain: {})\n",
+            self.id,
+            remain
+        );
+        if remain == 0 {
+            mod_pr_debug!("JobFence[{}]: Signaling\n", self.id);
+            if self.signal().is_err() {
+                pr_err!("JobFence[{}]: Fence signal failed\n", self.id);
+            }
+        }
+    }
+}
+
+#[versions(AGX)]
+#[vtable]
+impl dma_fence::FenceOps for JobFence::ver {
+    const USE_64BIT_SEQNO: bool = true;
+
+    fn get_driver_name<'a>(self: &'a FenceObject<Self>) -> &'a CStr {
+        c_str!("asahi")
+    }
+    fn get_timeline_name<'a>(self: &'a FenceObject<Self>) -> &'a CStr {
+        c_str!("queue")
+    }
+}
+
+#[versions(AGX)]
+pub(crate) struct QueueJob {
+    dev: AsahiDevice,
+    vm_bind: mmu::VmBind,
+    op_guard: Option<gpu::OpGuard>,
+    sj_vtx: Option<SubQueueJob::ver>,
+    sj_frag: Option<SubQueueJob::ver>,
+    sj_comp: Option<SubQueueJob::ver>,
+    fence: UserFence<JobFence::ver>,
+    did_run: bool,
+    id: u64,
+}
+
+#[versions(AGX)]
+impl QueueJob::ver {
+    fn get_vtx(&mut self) -> Result<&mut workqueue::Job::ver> {
+        self.sj_vtx.as_mut().ok_or(EINVAL)?.get()
+    }
+    fn get_frag(&mut self) -> Result<&mut workqueue::Job::ver> {
+        self.sj_frag.as_mut().ok_or(EINVAL)?.get()
+    }
+    fn get_comp(&mut self) -> Result<&mut workqueue::Job::ver> {
+        self.sj_comp.as_mut().ok_or(EINVAL)?.get()
+    }
+
+    fn commit(&mut self) -> Result {
+        mod_dev_dbg!(self.dev, "QueueJob: Committing\n");
+
+        self.sj_vtx.as_mut().map(|a| a.commit()).unwrap_or(Ok(()))?;
+        self.sj_frag
+            .as_mut()
+            .map(|a| a.commit())
+            .unwrap_or(Ok(()))?;
+        self.sj_comp.as_mut().map(|a| a.commit()).unwrap_or(Ok(()))
+    }
+}
+
+#[versions(AGX)]
+impl sched::JobImpl for QueueJob::ver {
+    fn can_run(job: &mut sched::Job<Self>) -> bool {
+        mod_dev_dbg!(job.dev, "QueueJob {}: Checking runnability\n", job.id);
+
+        if let Some(sj) = job.sj_vtx.as_ref() {
+            if !sj.can_submit() {
+                mod_dev_dbg!(
+                    job.dev,
+                    "QueueJob {}: Blocking due to vertex queue full\n",
+                    job.id
+                );
+                return false;
+            }
+        }
+        if let Some(sj) = job.sj_frag.as_ref() {
+            if !sj.can_submit() {
+                mod_dev_dbg!(
+                    job.dev,
+                    "QueueJob {}: Blocking due to fragment queue full\n",
+                    job.id
+                );
+                return false;
+            }
+        }
+        if let Some(sj) = job.sj_comp.as_ref() {
+            if !sj.can_submit() {
+                mod_dev_dbg!(
+                    job.dev,
+                    "QueueJob {}: Blocking due to compute queue full\n",
+                    job.id
+                );
+                return false;
+            }
+        }
+        true
+    }
+
+    #[allow(unused_assignments)]
+    fn run(job: &mut sched::Job<Self>) -> Result<Option<dma_fence::Fence>> {
+        mod_dev_dbg!(job.dev, "QueueJob {}: Running Job\n", job.id);
+
+        let dev = job.dev.data();
+        let gpu = match dev
+            .gpu
+            .clone()
+            .arc_as_any()
+            .downcast::<gpu::GpuManager::ver>()
+        {
+            Ok(gpu) => gpu,
+            Err(_) => {
+                dev_crit!(job.dev, "GpuManager mismatched with QueueJob!\n");
+                return Err(EIO);
+            }
+        };
+
+        if job.op_guard.is_none() {
+            job.op_guard = Some(gpu.start_op()?);
+        }
+
+        // First submit all the commands for each queue. This can fail.
+
+        let mut frag_job = None;
+        let mut frag_sub = None;
+        if let Some(sj) = job.sj_frag.as_mut() {
+            frag_job = sj.job.take();
+            if let Some(wqjob) = frag_job.as_mut() {
+                mod_dev_dbg!(job.dev, "QueueJob {}: Submit fragment\n", job.id);
+                frag_sub = Some(wqjob.submit()?);
+            }
+        }
+
+        let mut vtx_job = None;
+        let mut vtx_sub = None;
+        if let Some(sj) = job.sj_vtx.as_mut() {
+            vtx_job = sj.job.take();
+            if let Some(wqjob) = vtx_job.as_mut() {
+                mod_dev_dbg!(job.dev, "QueueJob {}: Submit vertex\n", job.id);
+                vtx_sub = Some(wqjob.submit()?);
+            }
+        }
+
+        let mut comp_job = None;
+        let mut comp_sub = None;
+        if let Some(sj) = job.sj_comp.as_mut() {
+            comp_job = sj.job.take();
+            if let Some(wqjob) = comp_job.as_mut() {
+                mod_dev_dbg!(job.dev, "QueueJob {}: Submit compute\n", job.id);
+                comp_sub = Some(wqjob.submit()?);
+            }
+        }
+
+        // Now we fully commit to running the job
+        mod_dev_dbg!(job.dev, "QueueJob {}: Run fragment\n", job.id);
+        frag_sub.map(|a| gpu.run_job(a)).transpose()?;
+
+        mod_dev_dbg!(job.dev, "QueueJob {}: Run vertex\n", job.id);
+        vtx_sub.map(|a| gpu.run_job(a)).transpose()?;
+
+        mod_dev_dbg!(job.dev, "QueueJob {}: Run compute\n", job.id);
+        comp_sub.map(|a| gpu.run_job(a)).transpose()?;
+
+        mod_dev_dbg!(job.dev, "QueueJob {}: Drop compute job\n", job.id);
+        core::mem::drop(comp_job);
+        mod_dev_dbg!(job.dev, "QueueJob {}: Drop vertex job\n", job.id);
+        core::mem::drop(vtx_job);
+        mod_dev_dbg!(job.dev, "QueueJob {}: Drop fragment job\n", job.id);
+        core::mem::drop(frag_job);
+
+        job.did_run = true;
+
+        Ok(Some(Fence::from_fence(&job.fence)))
+    }
+
+    fn timed_out(job: &mut sched::Job<Self>) -> sched::Status {
+        // FIXME: Handle timeouts properly
+        dev_err!(
+            job.dev,
+            "QueueJob {}: Job timed out on the DRM scheduler, things will probably break (ran: {})\n",
+            job.id, job.did_run
+        );
+        sched::Status::NoDevice
+    }
+}
+
+#[versions(AGX)]
+impl Drop for QueueJob::ver {
+    fn drop(&mut self) {
+        mod_dev_dbg!(self.dev, "QueueJob {}: Dropping\n", self.id);
+    }
+}
+
+struct ResultWriter {
+    vmap: VMap<gem::DriverObject>,
+    offset: usize,
+    len: usize,
+}
+
+impl ResultWriter {
+    fn write<T>(&mut self, mut value: T) {
+        let p: *mut u8 = &mut value as *mut _ as *mut u8;
+        // SAFETY: We know `p` points to a type T of that size, and UAPI types must have
+        // no padding and all bit patterns valid.
+        let slice = unsafe { core::slice::from_raw_parts_mut(p, core::mem::size_of::<T>()) };
+        let len = slice.len().min(self.len);
+        self.vmap.as_mut_slice()[self.offset..self.offset + len].copy_from_slice(&slice[..len]);
+    }
+}
+
+static QUEUE_NAME: &CStr = c_str!("asahi_fence");
+static QUEUE_CLASS_KEY: kernel::sync::LockClassKey = kernel::sync::LockClassKey::new();
+
+#[versions(AGX)]
+impl Queue::ver {
+    /// Create a new user queue.
+    #[allow(clippy::too_many_arguments)]
+    pub(crate) fn new(
+        dev: &AsahiDevice,
+        vm: mmu::Vm,
+        alloc: &mut gpu::KernelAllocators,
+        ualloc: Arc<Mutex<alloc::DefaultAllocator>>,
+        ualloc_priv: Arc<Mutex<alloc::DefaultAllocator>>,
+        event_manager: Arc<event::EventManager>,
+        mgr: &buffer::BufferManager,
+        id: u64,
+        priority: u32,
+        caps: u32,
+    ) -> Result<Queue::ver> {
+        mod_dev_dbg!(dev, "[Queue {}] Creating queue\n", id);
+
+        let data = dev.data();
+
+        let mut notifier_list = alloc.private.new_default::<fw::event::NotifierList>()?;
+
+        let self_ptr = notifier_list.weak_pointer();
+        notifier_list.with_mut(|raw, _inner| {
+            raw.list_head.next = Some(inner_weak_ptr!(self_ptr, list_head));
+        });
+
+        let threshold = alloc.shared.new_default::<fw::event::Threshold>()?;
+
+        let notifier: Arc<GpuObject<fw::event::Notifier::ver>> =
+            Arc::try_new(alloc.private.new_inplace(
+                fw::event::Notifier::ver { threshold },
+                |inner, ptr: &mut MaybeUninit<fw::event::raw::Notifier::ver<'_>>| {
+                    Ok(place!(
+                        ptr,
+                        fw::event::raw::Notifier::ver {
+                            threshold: inner.threshold.gpu_pointer(),
+                            generation: AtomicU32::new(id as u32),
+                            cur_count: AtomicU32::new(0),
+                            unk_10: AtomicU32::new(0x50),
+                            state: Default::default()
+                        }
+                    ))
+                },
+            )?)?;
+
+        let sched = sched::Scheduler::new(dev, WQ_SIZE, 0, 100000, c_str!("asahi_sched"))?;
+        // Priorities are handled by the AGX scheduler, there is no meaning within a
+        // per-queue scheduler.
+        let entity = sched::Entity::new(&sched, sched::Priority::Normal)?;
+
+        let mut ret = Queue::ver {
+            dev: dev.clone(),
+            _sched: sched,
+            entity,
+            vm,
+            ualloc,
+            q_vtx: None,
+            q_frag: None,
+            q_comp: None,
+            buffer: None,
+            gpu_context: Arc::try_new(workqueue::GpuContext::new(dev, alloc)?)?,
+            notifier_list: Arc::try_new(notifier_list)?,
+            notifier,
+            id,
+            fence_ctx: FenceContexts::new(1, QUEUE_NAME, &QUEUE_CLASS_KEY)?,
+            #[ver(V >= V13_0B4)]
+            counter: AtomicU64::new(0),
+        };
+
+        // Rendering structures
+        if caps & bindings::drm_asahi_queue_cap_DRM_ASAHI_QUEUE_CAP_RENDER != 0 {
+            let buffer =
+                buffer::Buffer::ver::new(&*data.gpu, alloc, ret.ualloc.clone(), ualloc_priv, mgr)?;
+            let tvb_blocks = {
+                let lock = crate::THIS_MODULE.kernel_param_lock();
+                *crate::initial_tvb_size.read(&lock)
+            };
+
+            buffer.ensure_blocks(tvb_blocks)?;
+
+            ret.buffer = Some(Mutex::new(buffer));
+            ret.q_vtx = Some(SubQueue::ver {
+                wq: workqueue::WorkQueue::ver::new(
+                    alloc,
+                    event_manager.clone(),
+                    ret.gpu_context.clone(),
+                    ret.notifier_list.clone(),
+                    channel::PipeType::Vertex,
+                    id,
+                    priority,
+                    WQ_SIZE,
+                )?,
+            });
+        }
+
+        // Rendering & blit structures
+        if caps
+            & (bindings::drm_asahi_queue_cap_DRM_ASAHI_QUEUE_CAP_RENDER
+                | bindings::drm_asahi_queue_cap_DRM_ASAHI_QUEUE_CAP_BLIT)
+            != 0
+        {
+            ret.q_frag = Some(SubQueue::ver {
+                wq: workqueue::WorkQueue::ver::new(
+                    alloc,
+                    event_manager.clone(),
+                    ret.gpu_context.clone(),
+                    ret.notifier_list.clone(),
+                    channel::PipeType::Fragment,
+                    id,
+                    priority,
+                    WQ_SIZE,
+                )?,
+            });
+        }
+
+        // Compute structures
+        if caps & bindings::drm_asahi_queue_cap_DRM_ASAHI_QUEUE_CAP_COMPUTE != 0 {
+            ret.q_comp = Some(SubQueue::ver {
+                wq: workqueue::WorkQueue::ver::new(
+                    alloc,
+                    event_manager,
+                    ret.gpu_context.clone(),
+                    ret.notifier_list.clone(),
+                    channel::PipeType::Compute,
+                    id,
+                    priority,
+                    WQ_SIZE,
+                )?,
+            });
+        }
+
+        mod_dev_dbg!(dev, "[Queue {}] Queue created\n", id);
+        Ok(ret)
+    }
+}
+
+const SQ_RENDER: usize = bindings::drm_asahi_subqueue_DRM_ASAHI_SUBQUEUE_RENDER as usize;
+const SQ_COMPUTE: usize = bindings::drm_asahi_subqueue_DRM_ASAHI_SUBQUEUE_COMPUTE as usize;
+const SQ_COUNT: usize = bindings::drm_asahi_subqueue_DRM_ASAHI_SUBQUEUE_COUNT as usize;
+
+#[versions(AGX)]
+impl Queue for Queue::ver {
+    fn submit(
+        &mut self,
+        id: u64,
+        in_syncs: Vec<file::SyncItem>,
+        out_syncs: Vec<file::SyncItem>,
+        result_buf: Option<gem::ObjectRef>,
+        commands: Vec<bindings::drm_asahi_command>,
+    ) -> Result {
+        let dev = self.dev.data();
+        let gpu = match dev
+            .gpu
+            .clone()
+            .arc_as_any()
+            .downcast::<gpu::GpuManager::ver>()
+        {
+            Ok(gpu) => gpu,
+            Err(_) => {
+                dev_crit!(self.dev, "GpuManager mismatched with JobImpl!\n");
+                return Err(EIO);
+            }
+        };
+
+        mod_dev_dbg!(self.dev, "[Submission {}] Submit job\n", id);
+
+        if gpu.is_crashed() {
+            dev_err!(
+                self.dev,
+                "[Submission {}] GPU is crashed, cannot submit\n",
+                id
+            );
+            return Err(ENODEV);
+        }
+
+        // Empty submissions are not legal
+        if commands.is_empty() {
+            return Err(EINVAL);
+        }
+
+        let op_guard = if !in_syncs.is_empty() {
+            Some(gpu.start_op()?)
+        } else {
+            None
+        };
+
+        let mut events: [Vec<Option<workqueue::QueueEventInfo::ver>>; SQ_COUNT] =
+            Default::default();
+
+        events[SQ_RENDER].try_push(self.q_frag.as_ref().and_then(|a| a.wq.event_info()))?;
+        events[SQ_COMPUTE].try_push(self.q_comp.as_ref().and_then(|a| a.wq.event_info()))?;
+
+        let vm_bind = gpu.bind_vm(&self.vm)?;
+        let vm_slot = vm_bind.slot();
+
+        mod_dev_dbg!(self.dev, "[Submission {}] Creating job\n", id);
+        let mut job = self.entity.new_job(QueueJob::ver {
+            dev: self.dev.clone(),
+            vm_bind,
+            op_guard,
+            sj_vtx: self.q_vtx.as_mut().map(|a| a.new_job()),
+            sj_frag: self.q_frag.as_mut().map(|a| a.new_job()),
+            sj_comp: self.q_comp.as_mut().map(|a| a.new_job()),
+            fence: self
+                .fence_ctx
+                .new_fence::<JobFence::ver>(
+                    0,
+                    JobFence::ver {
+                        id,
+                        pending: Default::default(),
+                    },
+                )?
+                .into(),
+            did_run: false,
+            id,
+        })?;
+
+        mod_dev_dbg!(
+            self.dev,
+            "[Submission {}] Adding {} in_syncs\n",
+            id,
+            in_syncs.len()
+        );
+        for sync in in_syncs {
+            job.add_dependency(sync.fence.expect("in_sync missing fence"))?;
+        }
+
+        let mut last_render = None;
+        let mut last_compute = None;
+
+        for (i, cmd) in commands.iter().enumerate() {
+            match cmd.cmd_type {
+                bindings::drm_asahi_cmd_type_DRM_ASAHI_CMD_RENDER => last_render = Some(i),
+                bindings::drm_asahi_cmd_type_DRM_ASAHI_CMD_COMPUTE => last_compute = Some(i),
+                _ => return Err(EINVAL),
+            }
+        }
+
+        mod_dev_dbg!(
+            self.dev,
+            "[Submission {}] Submitting {} commands\n",
+            id,
+            commands.len()
+        );
+        for (i, cmd) in commands.into_iter().enumerate() {
+            for (queue_idx, index) in cmd.barriers.iter().enumerate() {
+                if *index == bindings::DRM_ASAHI_BARRIER_NONE as u32 {
+                    continue;
+                }
+                if let Some(event) = events[queue_idx].get(*index as usize).ok_or(EINVAL)? {
+                    let mut alloc = gpu.alloc();
+                    let queue_job = match cmd.cmd_type {
+                        bindings::drm_asahi_cmd_type_DRM_ASAHI_CMD_RENDER => job.get_vtx()?,
+                        bindings::drm_asahi_cmd_type_DRM_ASAHI_CMD_COMPUTE => job.get_comp()?,
+                        _ => return Err(EINVAL),
+                    };
+                    mod_dev_dbg!(self.dev, "[Submission {}] Create Explicit Barrier\n", id);
+                    let barrier: GpuObject<fw::workqueue::Barrier> = alloc.private.new_inplace(
+                        Default::default(),
+                        |_inner, ptr: &mut MaybeUninit<fw::workqueue::raw::Barrier>| {
+                            Ok(place!(
+                                ptr,
+                                fw::workqueue::raw::Barrier {
+                                    tag: fw::workqueue::CommandType::Barrier,
+                                    wait_stamp: event.fw_stamp_pointer,
+                                    wait_value: event.value,
+                                    wait_slot: event.slot,
+                                    stamp_self: queue_job.event_info().value.next(),
+                                    uuid: 0xffffbbbb,
+                                    unk: 0,
+                                }
+                            ))
+                        },
+                    )?;
+                    mod_dev_dbg!(self.dev, "[Submission {}] Add Explicit Barrier\n", id);
+                    queue_job.add(barrier, vm_slot)?;
+                } else {
+                    assert!(*index == 0);
+                }
+            }
+
+            let result_writer = match result_buf.as_ref() {
+                None => {
+                    if cmd.result_offset != 0 || cmd.result_size != 0 {
+                        return Err(EINVAL);
+                    }
+                    None
+                }
+                Some(buf) => {
+                    if cmd.result_size != 0 {
+                        if cmd
+                            .result_offset
+                            .checked_add(cmd.result_size)
+                            .ok_or(EINVAL)?
+                            > buf.size() as u64
+                        {
+                            return Err(EINVAL);
+                        }
+                        Some(ResultWriter {
+                            vmap: buf.gem.vmap()?,
+                            offset: cmd.result_offset.try_into()?,
+                            len: cmd.result_size.try_into()?,
+                        })
+                    } else {
+                        None
+                    }
+                }
+            };
+
+            match cmd.cmd_type {
+                bindings::drm_asahi_cmd_type_DRM_ASAHI_CMD_RENDER => {
+                    self.submit_render(
+                        &mut job,
+                        &cmd,
+                        result_writer,
+                        id,
+                        last_render.unwrap() == i,
+                    )?;
+                    events[SQ_RENDER].try_push(Some(
+                        job.sj_frag
+                            .as_ref()
+                            .expect("No frag queue?")
+                            .job
+                            .as_ref()
+                            .expect("No frag job?")
+                            .event_info(),
+                    ))?;
+                }
+                bindings::drm_asahi_cmd_type_DRM_ASAHI_CMD_COMPUTE => {
+                    self.submit_compute(
+                        &mut job,
+                        &cmd,
+                        result_writer,
+                        id,
+                        last_compute.unwrap() == i,
+                    )?;
+                    events[SQ_COMPUTE].try_push(Some(
+                        job.sj_comp
+                            .as_ref()
+                            .expect("No comp queue?")
+                            .job
+                            .as_ref()
+                            .expect("No comp job?")
+                            .event_info(),
+                    ))?;
+                }
+                _ => return Err(EINVAL),
+            }
+        }
+
+        mod_dev_dbg!(self.dev, "Queue: Committing job\n");
+        job.commit()?;
+
+        mod_dev_dbg!(self.dev, "Queue: Arming job\n");
+        let job = job.arm();
+        let out_fence = job.fences().finished();
+        mod_dev_dbg!(self.dev, "Queue: Pushing job\n");
+        job.push();
+
+        mod_dev_dbg!(self.dev, "Queue: Adding {} out_syncs\n", out_syncs.len());
+        for mut sync in out_syncs {
+            if let Some(chain) = sync.chain_fence.take() {
+                sync.syncobj
+                    .add_point(chain, &out_fence, sync.timeline_value);
+            } else {
+                sync.syncobj.replace_fence(Some(&out_fence));
+            }
+        }
+
+        Ok(())
+    }
+}
+
+#[versions(AGX)]
+impl Drop for Queue::ver {
+    fn drop(&mut self) {
+        mod_dev_dbg!(self.dev, "[Queue {}] Dropping queue\n", self.id);
+    }
+}
diff --git a/drivers/gpu/drm/asahi/queue/render.rs b/drivers/gpu/drm/asahi/queue/render.rs
new file mode 100644
index 000000000000..318c952df020
--- /dev/null
+++ b/drivers/gpu/drm/asahi/queue/render.rs
@@ -0,0 +1,1173 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+#![allow(clippy::unusual_byte_groupings)]
+
+//! Render work queue.
+//!
+//! A render queue consists of two underlying WorkQueues, one for vertex and one for fragment work.
+//! This module is in charge of creating all of the firmware structures required to submit 3D
+//! rendering work to the GPU, based on the userspace command buffer.
+
+use super::common;
+use crate::alloc::Allocator;
+use crate::debug::*;
+use crate::fw::types::*;
+use crate::gpu::GpuManager;
+use crate::util::*;
+use crate::workqueue::WorkError;
+use crate::{box_in_place, inner_ptr, inner_weak_ptr, place};
+use crate::{buffer, fw, gpu, microseq, workqueue};
+use core::mem::MaybeUninit;
+use core::sync::atomic::Ordering;
+use kernel::bindings;
+use kernel::dma_fence::RawDmaFence;
+use kernel::drm::sched::Job;
+use kernel::io_buffer::IoBufferReader;
+use kernel::prelude::*;
+use kernel::sync::{smutex::Mutex, Arc};
+use kernel::user_ptr::UserSlicePtr;
+
+const DEBUG_CLASS: DebugFlags = DebugFlags::Render;
+
+/// Tiling/Vertex control bit to disable using more than one GPU cluster. This results in decreased
+/// throughput but also less latency, which is probably desirable for light vertex loads where the
+/// overhead of clustering/merging would exceed the time it takes to just run the job on one
+/// cluster.
+const TILECTL_DISABLE_CLUSTERING: u32 = 1u32 << 0;
+
+struct RenderResult {
+    result: bindings::drm_asahi_result_render,
+    vtx_complete: bool,
+    frag_complete: bool,
+    vtx_error: Option<workqueue::WorkError>,
+    frag_error: Option<workqueue::WorkError>,
+    writer: super::ResultWriter,
+}
+
+impl RenderResult {
+    fn commit(&mut self) {
+        if !self.vtx_complete || !self.frag_complete {
+            return;
+        }
+
+        let mut error = self.vtx_error.take();
+        if let Some(frag_error) = self.frag_error.take() {
+            if error.is_none() || error == Some(WorkError::Killed) {
+                error = Some(frag_error);
+            }
+        }
+
+        if let Some(err) = error {
+            self.result.info = err.into();
+        } else {
+            self.result.info.status = bindings::drm_asahi_status_DRM_ASAHI_STATUS_COMPLETE;
+        }
+
+        self.writer.write(self.result);
+    }
+}
+
+#[versions(AGX)]
+impl super::Queue::ver {
+    /// Get the appropriate tiling parameters for a given userspace command buffer.
+    fn get_tiling_params(
+        cmdbuf: &bindings::drm_asahi_cmd_render,
+        num_clusters: u32,
+    ) -> Result<buffer::TileInfo> {
+        let width: u32 = cmdbuf.fb_width;
+        let height: u32 = cmdbuf.fb_height;
+        let layers: u32 = cmdbuf.layers;
+
+        if width > 65536 || height > 65536 {
+            return Err(EINVAL);
+        }
+
+        if layers == 0 || layers > 2048 {
+            return Err(EINVAL);
+        }
+
+        let tile_width = 32u32;
+        let tile_height = 32u32;
+
+        let utile_width = cmdbuf.utile_width;
+        let utile_height = cmdbuf.utile_height;
+
+        match (utile_width, utile_height) {
+            (32, 32) | (32, 16) | (16, 16) => (),
+            _ => return Err(EINVAL),
+        };
+
+        let utiles_per_tile_x = tile_width / utile_width;
+        let utiles_per_tile_y = tile_height / utile_height;
+
+        let utiles_per_tile = utiles_per_tile_x * utiles_per_tile_y;
+
+        let tiles_x = (width + tile_width - 1) / tile_width;
+        let tiles_y = (height + tile_height - 1) / tile_height;
+        let tiles = tiles_x * tiles_y;
+
+        let mtiles_x = 4u32;
+        let mtiles_y = 4u32;
+        let mtiles = mtiles_x * mtiles_y;
+
+        // TODO: *samples
+        let tiles_per_mtile_x = align(div_ceil(tiles_x, mtiles_x), 4);
+        let tiles_per_mtile_y = align(div_ceil(tiles_y, mtiles_y), 4);
+        let tiles_per_mtile = tiles_per_mtile_x * tiles_per_mtile_y;
+
+        let mtile_x1 = tiles_per_mtile_x;
+        let mtile_x2 = 2 * tiles_per_mtile_x;
+        let mtile_x3 = 3 * tiles_per_mtile_x;
+
+        let mtile_y1 = tiles_per_mtile_y;
+        let mtile_y2 = 2 * tiles_per_mtile_y;
+        let mtile_y3 = 3 * tiles_per_mtile_y;
+
+        let rgn_entry_size = 5;
+        // Macrotile stride in 32-bit words
+        let rgn_size = align(rgn_entry_size * tiles_per_mtile * utiles_per_tile, 4) / 4;
+        let tilemap_size = (4 * rgn_size * mtiles * layers) as usize;
+
+        let tpc_entry_size = 8;
+        // TPC stride in 32-bit words
+        let tpc_mtile_stride = tpc_entry_size * utiles_per_tile * tiles_per_mtile / 4;
+        let tpc_size = (num_clusters * (4 * tpc_mtile_stride * mtiles) * layers) as usize;
+
+        // No idea where this comes from, but it fits what macOS does...
+        // TODO: layers?
+        let meta1_blocks = if num_clusters > 1 {
+            div_ceil(align(tiles_x, 2) * align(tiles_y, 4), 0x1980)
+        } else {
+            0
+        };
+
+        let min_tvb_blocks =
+            div_ceil(tiles_x * tiles_y, 128).max(if num_clusters > 1 { 9 } else { 8 }) as usize;
+
+        // Sometimes clustering seems to use twice the cluster tilemap count
+        // and twice the meta4 size. TODO: Is this random or can we calculate
+        // it somehow??? Does it go higher???
+        let cluster_factor = 2;
+
+        Ok(buffer::TileInfo {
+            tiles_x,
+            tiles_y,
+            tiles,
+            utile_width,
+            utile_height,
+            //mtiles_x,
+            //mtiles_y,
+            tiles_per_mtile_x,
+            tiles_per_mtile_y,
+            //tiles_per_mtile,
+            utiles_per_mtile_x: tiles_per_mtile_x * utiles_per_tile_x,
+            utiles_per_mtile_y: tiles_per_mtile_y * utiles_per_tile_y,
+            //utiles_per_mtile: tiles_per_mtile * utiles_per_tile,
+            tilemap_size,
+            tpc_size,
+            meta1_blocks,
+            min_tvb_blocks,
+            cluster_factor,
+            params: fw::vertex::raw::TilingParameters {
+                rgn_size,
+                unk_4: 0x88,
+                ppp_ctrl: cmdbuf.ppp_ctrl,
+                x_max: (width - 1) as u16,
+                y_max: (height - 1) as u16,
+                te_screen: ((tiles_y - 1) << 12) | (tiles_x - 1),
+                te_mtile1: mtile_x3 | (mtile_x2 << 9) | (mtile_x1 << 18),
+                te_mtile2: mtile_y3 | (mtile_y2 << 9) | (mtile_y1 << 18),
+                tiles_per_mtile,
+                tpc_stride: tpc_mtile_stride,
+                unk_24: 0x100,
+                unk_28: if layers > 1 {
+                    0xe000 | (layers - 1)
+                } else {
+                    0x8000
+                },
+            },
+        })
+    }
+
+    /// Submit work to a render queue.
+    pub(super) fn submit_render(
+        &self,
+        job: &mut Job<super::QueueJob::ver>,
+        cmd: &bindings::drm_asahi_command,
+        result_writer: Option<super::ResultWriter>,
+        id: u64,
+        flush_stamps: bool,
+    ) -> Result {
+        if cmd.cmd_type != bindings::drm_asahi_cmd_type_DRM_ASAHI_CMD_RENDER {
+            return Err(EINVAL);
+        }
+
+        mod_dev_dbg!(self.dev, "[Submission {}] Render!\n", id);
+
+        let mut cmdbuf_reader = unsafe {
+            UserSlicePtr::new(
+                cmd.cmd_buffer as usize as *mut _,
+                core::mem::size_of::<bindings::drm_asahi_cmd_render>(),
+            )
+            .reader()
+        };
+
+        let mut cmdbuf: MaybeUninit<bindings::drm_asahi_cmd_render> = MaybeUninit::uninit();
+        unsafe {
+            cmdbuf_reader.read_raw(
+                cmdbuf.as_mut_ptr() as *mut u8,
+                core::mem::size_of::<bindings::drm_asahi_cmd_render>(),
+            )?;
+        }
+        let cmdbuf = unsafe { cmdbuf.assume_init() };
+
+        if cmdbuf.flags
+            & !(bindings::ASAHI_RENDER_NO_CLEAR_PIPELINE_TEXTURES
+                | bindings::ASAHI_RENDER_SET_WHEN_RELOADING_Z_OR_S
+                | bindings::ASAHI_RENDER_MEMORYLESS_RTS_USED
+                | bindings::ASAHI_RENDER_PROCESS_EMPTY_TILES
+                | bindings::ASAHI_RENDER_NO_VERTEX_CLUSTERING) as u64
+            != 0
+        {
+            return Err(EINVAL);
+        }
+
+        if cmdbuf.flags & bindings::ASAHI_RENDER_MEMORYLESS_RTS_USED as u64 != 0 {
+            // Not supported yet
+            return Err(EINVAL);
+        }
+
+        if cmdbuf.fb_width == 0
+            || cmdbuf.fb_height == 0
+            || cmdbuf.fb_width > 16384
+            || cmdbuf.fb_height > 16384
+        {
+            mod_dev_dbg!(
+                self.dev,
+                "[Submission {}] Invalid dimensions {}x{}\n",
+                id,
+                cmdbuf.fb_width,
+                cmdbuf.fb_height
+            );
+            return Err(EINVAL);
+        }
+
+        let dev = self.dev.data();
+        let gpu = match dev.gpu.as_any().downcast_ref::<gpu::GpuManager::ver>() {
+            Some(gpu) => gpu,
+            None => {
+                dev_crit!(self.dev, "GpuManager mismatched with Queue!\n");
+                return Err(EIO);
+            }
+        };
+
+        let nclusters = gpu.get_dyncfg().id.num_clusters;
+
+        // Can be set to false to disable clustering (for simpler jobs), but then the
+        // core masks below should be adjusted to cover a single rolling cluster.
+        let mut clustering = nclusters > 1;
+
+        if debug_enabled(debug::DebugFlags::DisableClustering)
+            || cmdbuf.flags & bindings::ASAHI_RENDER_NO_VERTEX_CLUSTERING as u64 != 0
+        {
+            clustering = false;
+        }
+
+        #[ver(G < G14)]
+        let tiling_control = {
+            let render_cfg = gpu.get_cfg().render;
+            let mut tiling_control = render_cfg.tiling_control;
+
+            if !clustering {
+                tiling_control |= TILECTL_DISABLE_CLUSTERING;
+            }
+            tiling_control
+        };
+
+        let mut alloc = gpu.alloc();
+        let kalloc = &mut *alloc;
+
+        // This sequence number increases per new client/VM? assigned to some slot,
+        // but it's unclear *which* slot...
+        let slot_client_seq: u8 = (self.id & 0xff) as u8;
+
+        let tile_info = Self::get_tiling_params(&cmdbuf, if clustering { nclusters } else { 1 })?;
+
+        let buffer = self.buffer.as_ref().ok_or(EINVAL)?.lock();
+
+        let scene = Arc::try_new(buffer.new_scene(kalloc, &tile_info)?)?;
+
+        let notifier = self.notifier.clone();
+
+        let tvb_autogrown = buffer.auto_grow()?;
+        if tvb_autogrown {
+            let new_size = buffer.block_count() as usize;
+            cls_dev_dbg!(
+                TVBStats,
+                &self.dev,
+                "[Submission {}] TVB grew to {} bytes ({} blocks) due to overflows\n",
+                id,
+                new_size * buffer::BLOCK_SIZE,
+                new_size,
+            );
+        }
+
+        let tvb_grown = buffer.ensure_blocks(tile_info.min_tvb_blocks)?;
+        if tvb_grown {
+            cls_dev_dbg!(
+                TVBStats,
+                &self.dev,
+                "[Submission {}] TVB grew to {} bytes ({} blocks) due to dimensions ({}x{})\n",
+                id,
+                tile_info.min_tvb_blocks * buffer::BLOCK_SIZE,
+                tile_info.min_tvb_blocks,
+                cmdbuf.fb_width,
+                cmdbuf.fb_height
+            );
+        }
+
+        let vm_bind = job.vm_bind.clone();
+
+        mod_dev_dbg!(
+            self.dev,
+            "[Submission {}] VM slot = {}\n",
+            id,
+            vm_bind.slot()
+        );
+
+        let ev_vtx = job.get_vtx()?.event_info();
+        let ev_frag = job.get_frag()?.event_info();
+
+        mod_dev_dbg!(
+            self.dev,
+            "[Submission {}] Vert event #{} -> {:#x?}\n",
+            id,
+            ev_vtx.slot,
+            ev_vtx.value.next(),
+        );
+        mod_dev_dbg!(
+            self.dev,
+            "[Submission {}] Frag event #{} -> {:#x?}\n",
+            id,
+            ev_frag.slot,
+            ev_frag.value.next(),
+        );
+
+        let uuid_3d = cmdbuf.cmd_3d_id;
+        let uuid_ta = cmdbuf.cmd_ta_id;
+
+        mod_dev_dbg!(
+            self.dev,
+            "[Submission {}] Vert UUID = {:#x?}\n",
+            id,
+            uuid_ta
+        );
+        mod_dev_dbg!(
+            self.dev,
+            "[Submission {}] Frag UUID = {:#x?}\n",
+            id,
+            uuid_3d
+        );
+
+        let fence = job.fence.clone();
+        let frag_job = job.get_frag()?;
+
+        mod_dev_dbg!(self.dev, "[Submission {}] Create Barrier\n", id);
+        let barrier: GpuObject<fw::workqueue::Barrier> = kalloc.private.new_inplace(
+            Default::default(),
+            |_inner, ptr: &mut MaybeUninit<fw::workqueue::raw::Barrier>| {
+                Ok(place!(
+                    ptr,
+                    fw::workqueue::raw::Barrier {
+                        tag: fw::workqueue::CommandType::Barrier,
+                        wait_stamp: ev_vtx.fw_stamp_pointer,
+                        wait_value: ev_vtx.value.next(),
+                        wait_slot: ev_vtx.slot,
+                        stamp_self: ev_frag.value.next(),
+                        uuid: uuid_3d,
+                        unk: 0,
+                    }
+                ))
+            },
+        )?;
+
+        mod_dev_dbg!(self.dev, "[Submission {}] Add Barrier\n", id);
+        frag_job.add(barrier, vm_bind.slot())?;
+
+        let timestamps = Arc::try_new(kalloc.shared.new_default::<fw::job::RenderTimestamps>()?)?;
+
+        let unk1 = debug_enabled(debug::DebugFlags::Debug1);
+        let unk2 = debug_enabled(debug::DebugFlags::Debug2);
+        let unk3 = debug_enabled(debug::DebugFlags::Debug3);
+
+        let mut tile_config: u64 = 0;
+        if !unk1 {
+            tile_config |= 0x280;
+        }
+        if cmdbuf.layers > 1 {
+            tile_config |= 1;
+        }
+        if cmdbuf.flags & bindings::ASAHI_RENDER_PROCESS_EMPTY_TILES as u64 != 0 {
+            tile_config |= 0x10000;
+        }
+
+        let mut utile_config =
+            ((tile_info.utile_width / 16) << 12) | ((tile_info.utile_height / 16) << 14);
+        utile_config |= match cmdbuf.samples {
+            1 => 0,
+            2 => 1,
+            4 => 2,
+            _ => return Err(EINVAL),
+        };
+
+        let frag_result = result_writer
+            .map(|writer| {
+                let mut result = RenderResult {
+                    result: Default::default(),
+                    vtx_complete: false,
+                    frag_complete: false,
+                    vtx_error: None,
+                    frag_error: None,
+                    writer,
+                };
+
+                if tvb_autogrown {
+                    result.result.flags |= bindings::DRM_ASAHI_RESULT_RENDER_TVB_GROW_OVF as u64;
+                }
+                if tvb_grown {
+                    result.result.flags |= bindings::DRM_ASAHI_RESULT_RENDER_TVB_GROW_MIN as u64;
+                }
+                result.result.tvb_size_bytes = buffer.size() as u64;
+
+                Arc::try_new(Mutex::new(result))
+            })
+            .transpose()?;
+
+        let vtx_result = frag_result.clone();
+
+        // TODO: check
+        #[ver(V >= V13_0B4)]
+        let count_frag = self.counter.fetch_add(2, Ordering::Relaxed);
+        #[ver(V >= V13_0B4)]
+        let count_vtx = count_frag + 1;
+
+        mod_dev_dbg!(self.dev, "[Submission {}] Create Frag\n", id);
+        let frag = GpuObject::new_prealloc(
+            kalloc.private.alloc_object()?,
+            |ptr: GpuWeakPointer<fw::fragment::RunFragment::ver>| {
+                let mut builder = microseq::Builder::new();
+
+                let stats = inner_weak_ptr!(
+                    gpu.initdata.runtime_pointers.stats.frag.weak_pointer(),
+                    stats
+                );
+
+                let start_frag = builder.add(microseq::StartFragment::ver {
+                    header: microseq::op::StartFragment::HEADER,
+                    job_params2: inner_weak_ptr!(ptr, job_params2),
+                    job_params1: inner_weak_ptr!(ptr, job_params1),
+                    scene: scene.gpu_pointer(),
+                    stats,
+                    busy_flag: inner_weak_ptr!(ptr, busy_flag),
+                    tvb_overflow_count: inner_weak_ptr!(ptr, tvb_overflow_count),
+                    unk_pointer: inner_weak_ptr!(ptr, unk_pointee),
+                    work_queue: ev_frag.info_ptr,
+                    work_item: ptr,
+                    vm_slot: vm_bind.slot(),
+                    unk_50: 0x1, // fixed
+                    event_generation: self.id as u32,
+                    buffer_slot: scene.slot(),
+                    unk_5c: 0,
+                    cmd_seq: U64(ev_frag.cmd_seq),
+                    unk_68: 0,
+                    unk_758_flag: inner_weak_ptr!(ptr, unk_758_flag),
+                    unk_job_buf: inner_weak_ptr!(ptr, unk_buf_0),
+                    unk_7c: 0,
+                    unk_80: 0,
+                    unk_84: 0,
+                    uuid: uuid_3d,
+                    attachments: common::build_attachments(
+                        cmdbuf.attachments,
+                        cmdbuf.attachment_count,
+                    )?,
+                    unk_190: 0,
+                    #[ver(V >= V13_0B4)]
+                    counter: U64(count_frag),
+                    #[ver(V >= V13_0B4)]
+                    notifier_buf: inner_weak_ptr!(notifier.weak_pointer(), state.unk_buf),
+                })?;
+
+                if frag_result.is_some() {
+                    builder.add(microseq::Timestamp::ver {
+                        header: microseq::op::Timestamp::new(true),
+                        cur_ts: inner_weak_ptr!(ptr, cur_ts),
+                        start_ts: inner_weak_ptr!(ptr, start_ts),
+                        update_ts: inner_weak_ptr!(ptr, start_ts),
+                        work_queue: ev_frag.info_ptr,
+                        unk_24: U64(0),
+                        #[ver(V >= V13_0B4)]
+                        unk_ts: inner_weak_ptr!(ptr, unk_ts),
+                        uuid: uuid_3d,
+                        unk_30_padding: 0,
+                    })?;
+                }
+
+                builder.add(microseq::WaitForIdle {
+                    header: microseq::op::WaitForIdle::new(microseq::Pipe::Fragment),
+                })?;
+
+                if frag_result.is_some() {
+                    builder.add(microseq::Timestamp::ver {
+                        header: microseq::op::Timestamp::new(false),
+                        cur_ts: inner_weak_ptr!(ptr, cur_ts),
+                        start_ts: inner_weak_ptr!(ptr, start_ts),
+                        update_ts: inner_weak_ptr!(ptr, end_ts),
+                        work_queue: ev_frag.info_ptr,
+                        unk_24: U64(0),
+                        #[ver(V >= V13_0B4)]
+                        unk_ts: inner_weak_ptr!(ptr, unk_ts),
+                        uuid: uuid_3d,
+                        unk_30_padding: 0,
+                    })?;
+                }
+
+                let off = builder.offset_to(start_frag);
+                builder.add(microseq::FinalizeFragment::ver {
+                    header: microseq::op::FinalizeFragment::HEADER,
+                    uuid: uuid_3d,
+                    unk_8: 0,
+                    fw_stamp: ev_frag.fw_stamp_pointer,
+                    stamp_value: ev_frag.value.next(),
+                    unk_18: 0,
+                    scene: scene.weak_pointer(),
+                    buffer: scene.weak_buffer_pointer(),
+                    unk_2c: U64(1),
+                    stats,
+                    unk_pointer: inner_weak_ptr!(ptr, unk_pointee),
+                    busy_flag: inner_weak_ptr!(ptr, busy_flag),
+                    work_queue: ev_frag.info_ptr,
+                    work_item: ptr,
+                    vm_slot: vm_bind.slot(),
+                    unk_60: 0,
+                    unk_758_flag: inner_weak_ptr!(ptr, unk_758_flag),
+                    unk_6c: U64(0),
+                    unk_74: U64(0),
+                    unk_7c: U64(0),
+                    unk_84: U64(0),
+                    unk_8c: U64(0),
+                    #[ver(G == G14 && V < V13_0B4)]
+                    unk_8c_g14: U64(0),
+                    restart_branch_offset: off,
+                    unk_98: unk3.into(),
+                    #[ver(V >= V13_0B4)]
+                    unk_9c: Default::default(),
+                })?;
+
+                builder.add(microseq::RetireStamp {
+                    header: microseq::op::RetireStamp::HEADER,
+                })?;
+
+                Ok(box_in_place!(fw::fragment::RunFragment::ver {
+                    notifier: notifier.clone(),
+                    scene: scene.clone(),
+                    micro_seq: builder.build(&mut kalloc.private)?,
+                    vm_bind: vm_bind.clone(),
+                    aux_fb: self.ualloc.lock().array_empty(0x8000)?,
+                    timestamps: timestamps.clone(),
+                })?)
+            },
+            |inner, ptr| {
+                let aux_fb_info = fw::fragment::raw::AuxFBInfo::ver {
+                    iogpu_unk_214: cmdbuf.iogpu_unk_214,
+                    unk2: 0,
+                    width: cmdbuf.fb_width,
+                    height: cmdbuf.fb_height,
+                    #[ver(V >= V13_0B4)]
+                    unk3: U64(0x100000),
+                };
+
+                Ok(place!(
+                    ptr,
+                    fw::fragment::raw::RunFragment::ver {
+                        tag: fw::workqueue::CommandType::RunFragment,
+                        #[ver(V >= V13_0B4)]
+                        counter: U64(count_frag),
+                        vm_slot: vm_bind.slot(),
+                        unk_8: 0,
+                        microsequence: inner.micro_seq.gpu_pointer(),
+                        microsequence_size: inner.micro_seq.len() as u32,
+                        notifier: inner.notifier.gpu_pointer(),
+                        buffer: inner.scene.buffer_pointer(),
+                        scene: inner.scene.gpu_pointer(),
+                        unk_buffer_buf: inner.scene.kernel_buffer_pointer(),
+                        tvb_tilemap: inner.scene.tvb_tilemap_pointer(),
+                        ppp_multisamplectl: U64(cmdbuf.ppp_multisamplectl),
+                        samples: cmdbuf.samples,
+                        tiles_per_mtile_y: tile_info.tiles_per_mtile_y as u16,
+                        tiles_per_mtile_x: tile_info.tiles_per_mtile_x as u16,
+                        unk_50: U64(0),
+                        unk_58: U64(0),
+                        merge_upper_x: F32::from_bits(cmdbuf.merge_upper_x),
+                        merge_upper_y: F32::from_bits(cmdbuf.merge_upper_y),
+                        unk_68: U64(0),
+                        tile_count: U64(tile_info.tiles as u64),
+                        job_params1: fw::fragment::raw::JobParameters1::ver {
+                            utile_config: utile_config,
+                            unk_4: 0,
+                            clear_pipeline: fw::fragment::raw::ClearPipelineBinding {
+                                pipeline_bind: U64(cmdbuf.load_pipeline_bind as u64),
+                                address: U64(cmdbuf.load_pipeline as u64),
+                            },
+                            ppp_multisamplectl: U64(cmdbuf.ppp_multisamplectl),
+                            scissor_array: U64(cmdbuf.scissor_array),
+                            depth_bias_array: U64(cmdbuf.depth_bias_array),
+                            aux_fb_info: aux_fb_info,
+                            depth_dimensions: U64(cmdbuf.depth_dimensions as u64),
+                            visibility_result_buffer: U64(cmdbuf.visibility_result_buffer),
+                            zls_ctrl: U64(cmdbuf.zls_ctrl),
+                            #[ver(G >= G14)]
+                            unk_58_g14_0: U64(0x4040404),
+                            #[ver(G >= G14)]
+                            unk_58_g14_8: U64(0),
+                            depth_buffer_ptr1: U64(cmdbuf.depth_buffer_1),
+                            depth_buffer_ptr2: U64(cmdbuf.depth_buffer_2),
+                            stencil_buffer_ptr1: U64(cmdbuf.stencil_buffer_1),
+                            stencil_buffer_ptr2: U64(cmdbuf.stencil_buffer_2),
+                            #[ver(G >= G14)]
+                            unk_68_g14_0: Default::default(),
+                            unk_78: Default::default(),
+                            depth_meta_buffer_ptr1: U64(cmdbuf.depth_meta_buffer_1),
+                            unk_a0: Default::default(),
+                            depth_meta_buffer_ptr2: U64(cmdbuf.depth_meta_buffer_2),
+                            unk_b0: Default::default(),
+                            stencil_meta_buffer_ptr1: U64(cmdbuf.stencil_meta_buffer_1),
+                            unk_c0: Default::default(),
+                            stencil_meta_buffer_ptr2: U64(cmdbuf.stencil_meta_buffer_2),
+                            unk_d0: Default::default(),
+                            tvb_tilemap: inner.scene.tvb_tilemap_pointer(),
+                            tvb_heapmeta: inner.scene.tvb_heapmeta_pointer(),
+                            mtile_stride_dwords: U64((4 * tile_info.params.rgn_size as u64) << 24),
+                            tvb_heapmeta_2: inner.scene.tvb_heapmeta_pointer(),
+                            tile_config: U64(tile_config),
+                            aux_fb: inner.aux_fb.gpu_pointer(),
+                            unk_108: Default::default(),
+                            pipeline_base: U64(0x11_00000000),
+                            unk_140: U64(0x8c60),
+                            unk_148: U64(0x0),
+                            unk_150: U64(0x0),
+                            unk_158: U64(0x1c),
+                            unk_160: U64(0),
+                            unk_168_padding: Default::default(),
+                            #[ver(V < V13_0B4)]
+                            __pad0: Default::default(),
+                        },
+                        job_params2: fw::fragment::raw::JobParameters2 {
+                            store_pipeline_bind: cmdbuf.store_pipeline_bind,
+                            store_pipeline_addr: cmdbuf.store_pipeline,
+                            unk_8: 0x0,
+                            unk_c: 0x0,
+                            merge_upper_x: F32::from_bits(cmdbuf.merge_upper_x),
+                            merge_upper_y: F32::from_bits(cmdbuf.merge_upper_y),
+                            unk_18: U64(0x0),
+                            utiles_per_mtile_y: tile_info.utiles_per_mtile_y as u16,
+                            utiles_per_mtile_x: tile_info.utiles_per_mtile_x as u16,
+                            unk_24: 0x0,
+                            tile_counts: ((tile_info.tiles_y - 1) << 12) | (tile_info.tiles_x - 1),
+                            iogpu_unk_212: cmdbuf.iogpu_unk_212,
+                            isp_bgobjdepth: cmdbuf.isp_bgobjdepth,
+                            // TODO: does this flag need to be exposed to userspace?
+                            isp_bgobjvals: cmdbuf.isp_bgobjvals | 0x400,
+                            unk_38: 0x0,
+                            unk_3c: 0x1,
+                            unk_40: 0,
+                        },
+                        job_params3: fw::fragment::raw::JobParameters3::ver {
+                            unk_44_padding: Default::default(),
+                            depth_bias_array: fw::fragment::raw::ArrayAddr {
+                                ptr: U64(cmdbuf.depth_bias_array),
+                                unk_padding: U64(0),
+                            },
+                            scissor_array: fw::fragment::raw::ArrayAddr {
+                                ptr: U64(cmdbuf.scissor_array),
+                                unk_padding: U64(0),
+                            },
+                            visibility_result_buffer: U64(cmdbuf.visibility_result_buffer),
+                            unk_118: U64(0x0),
+                            unk_120: Default::default(),
+                            unk_reload_pipeline: fw::fragment::raw::ClearPipelineBinding {
+                                pipeline_bind: U64(cmdbuf.partial_reload_pipeline_bind as u64),
+                                address: U64(cmdbuf.partial_reload_pipeline as u64),
+                            },
+                            unk_258: U64(0),
+                            unk_260: U64(0),
+                            unk_268: U64(0),
+                            unk_270: U64(0),
+                            reload_pipeline: fw::fragment::raw::ClearPipelineBinding {
+                                pipeline_bind: U64(cmdbuf.partial_reload_pipeline_bind as u64),
+                                address: U64(cmdbuf.partial_reload_pipeline as u64),
+                            },
+                            zls_ctrl: U64(cmdbuf.zls_ctrl),
+                            unk_290: U64(0x0),
+                            depth_buffer_ptr1: U64(cmdbuf.depth_buffer_1),
+                            unk_2a0: U64(0x0),
+                            unk_2a8: U64(0x0),
+                            depth_buffer_ptr2: U64(cmdbuf.depth_buffer_2),
+                            depth_buffer_ptr3: U64(cmdbuf.depth_buffer_3),
+                            depth_meta_buffer_ptr3: U64(cmdbuf.depth_meta_buffer_3),
+                            stencil_buffer_ptr1: U64(cmdbuf.stencil_buffer_1),
+                            unk_2d0: U64(0x0),
+                            unk_2d8: U64(0x0),
+                            stencil_buffer_ptr2: U64(cmdbuf.stencil_buffer_2),
+                            stencil_buffer_ptr3: U64(cmdbuf.stencil_buffer_3),
+                            stencil_meta_buffer_ptr3: U64(cmdbuf.stencil_meta_buffer_3),
+                            unk_2f8: Default::default(),
+                            iogpu_unk_212: cmdbuf.iogpu_unk_212,
+                            unk_30c: 0x0,
+                            aux_fb_info: aux_fb_info,
+                            unk_320_padding: Default::default(),
+                            unk_partial_store_pipeline:
+                                fw::fragment::raw::StorePipelineBinding::new(
+                                    cmdbuf.partial_store_pipeline_bind,
+                                    cmdbuf.partial_store_pipeline
+                                ),
+                            partial_store_pipeline: fw::fragment::raw::StorePipelineBinding::new(
+                                cmdbuf.partial_store_pipeline_bind,
+                                cmdbuf.partial_store_pipeline
+                            ),
+                            isp_bgobjdepth: cmdbuf.isp_bgobjdepth,
+                            isp_bgobjvals: cmdbuf.isp_bgobjvals,
+                            iogpu_unk_49: cmdbuf.iogpu_unk_49,
+                            unk_37c: 0x0,
+                            unk_380: U64(0x0),
+                            unk_388: U64(0x0),
+                            #[ver(V >= V13_0B4)]
+                            unk_390_0: U64(0x0),
+                            depth_dimensions: U64(cmdbuf.depth_dimensions as u64),
+                        },
+                        unk_758_flag: 0,
+                        unk_75c_flag: 0,
+                        unk_buf: Default::default(),
+                        busy_flag: 0,
+                        tvb_overflow_count: 0,
+                        unk_878: 0,
+                        encoder_params: fw::job::raw::EncoderParams {
+                            unk_8: (cmdbuf.flags
+                                & bindings::ASAHI_RENDER_SET_WHEN_RELOADING_Z_OR_S as u64
+                                != 0) as u32,
+                            unk_c: 0x0,  // fixed
+                            unk_10: 0x0, // fixed
+                            encoder_id: cmdbuf.encoder_id,
+                            unk_18: 0x0, // fixed
+                            iogpu_compute_unk44: 0xffffffff,
+                            seq_buffer: inner.scene.seq_buf_pointer(),
+                            unk_28: U64(0x0), // fixed
+                        },
+                        process_empty_tiles: (cmdbuf.flags
+                            & bindings::ASAHI_RENDER_PROCESS_EMPTY_TILES as u64
+                            != 0) as u32,
+                        no_clear_pipeline_textures: (cmdbuf.flags
+                            & bindings::ASAHI_RENDER_NO_CLEAR_PIPELINE_TEXTURES as u64
+                            != 0) as u32,
+                        unk_param: unk2.into(), // 1 for boot stuff?
+                        unk_pointee: 0,
+                        meta: fw::job::raw::JobMeta {
+                            unk_4: 0,
+                            stamp: ev_frag.stamp_pointer,
+                            fw_stamp: ev_frag.fw_stamp_pointer,
+                            stamp_value: ev_frag.value.next(),
+                            stamp_slot: ev_frag.slot,
+                            evctl_index: 0, // fixed
+                            flush_stamps: flush_stamps as u32,
+                            uuid: uuid_3d,
+                            cmd_seq: ev_frag.cmd_seq as u32,
+                        },
+                        unk_after_meta: unk1.into(),
+                        unk_buf_0: U64(0),
+                        unk_buf_8: U64(0),
+                        unk_buf_10: U64(1),
+                        cur_ts: U64(0),
+                        start_ts: Some(inner_ptr!(inner.timestamps.gpu_pointer(), frag.start)),
+                        end_ts: Some(inner_ptr!(inner.timestamps.gpu_pointer(), frag.end)),
+                        unk_914: 0,
+                        unk_918: U64(0),
+                        unk_920: 0,
+                        client_sequence: slot_client_seq,
+                        pad_925: Default::default(),
+                        unk_928: 0,
+                        unk_92c: 0,
+                        #[ver(V >= V13_0B4)]
+                        unk_ts: U64(0),
+                        #[ver(V >= V13_0B4)]
+                        unk_92d_8: Default::default(),
+                    }
+                ))
+            },
+        )?;
+
+        mod_dev_dbg!(self.dev, "[Submission {}] Add Frag\n", id);
+        fence.add_command();
+
+        frag_job.add_cb(frag, vm_bind.slot(), move |cmd, error| {
+            if let Some(err) = error {
+                fence.set_error(err.into());
+            }
+            if let Some(mut res) = frag_result.as_ref().map(|a| a.lock()) {
+                cmd.timestamps.with(|raw, _inner| {
+                    res.result.fragment_ts_start = raw.frag.start.load(Ordering::Relaxed);
+                    res.result.fragment_ts_end = raw.frag.end.load(Ordering::Relaxed);
+                });
+                cmd.with(|raw, _inner| {
+                    res.result.num_tvb_overflows = raw.tvb_overflow_count;
+                });
+                res.frag_error = error;
+                res.frag_complete = true;
+                res.commit();
+            }
+            fence.command_complete();
+        })?;
+
+        let fence = job.fence.clone();
+        let vtx_job = job.get_vtx()?;
+
+        if scene.rebind() || tvb_grown || tvb_autogrown {
+            mod_dev_dbg!(self.dev, "[Submission {}] Create Bind Buffer\n", id);
+            let bind_buffer = kalloc.private.new_inplace(
+                fw::buffer::InitBuffer::ver {
+                    scene: scene.clone(),
+                },
+                |inner, ptr: &mut MaybeUninit<fw::buffer::raw::InitBuffer::ver<'_>>| {
+                    Ok(place!(
+                        ptr,
+                        fw::buffer::raw::InitBuffer::ver {
+                            tag: fw::workqueue::CommandType::InitBuffer,
+                            vm_slot: vm_bind.slot(),
+                            buffer_slot: inner.scene.slot(),
+                            unk_c: 0,
+                            block_count: buffer.block_count(),
+                            buffer: inner.scene.buffer_pointer(),
+                            stamp_value: ev_vtx.value.next(),
+                        }
+                    ))
+                },
+            )?;
+
+            mod_dev_dbg!(self.dev, "[Submission {}] Add Bind Buffer\n", id);
+            vtx_job.add(bind_buffer, vm_bind.slot())?;
+        }
+
+        mod_dev_dbg!(self.dev, "[Submission {}] Create Vertex\n", id);
+        let vtx = GpuObject::new_prealloc(
+            kalloc.private.alloc_object()?,
+            |ptr: GpuWeakPointer<fw::vertex::RunVertex::ver>| {
+                let mut builder = microseq::Builder::new();
+
+                let stats = inner_weak_ptr!(
+                    gpu.initdata.runtime_pointers.stats.vtx.weak_pointer(),
+                    stats
+                );
+
+                let start_vtx = builder.add(microseq::StartVertex::ver {
+                    header: microseq::op::StartVertex::HEADER,
+                    tiling_params: inner_weak_ptr!(ptr, tiling_params),
+                    job_params1: inner_weak_ptr!(ptr, job_params1),
+                    buffer: scene.weak_buffer_pointer(),
+                    scene: scene.weak_pointer(),
+                    stats,
+                    work_queue: ev_vtx.info_ptr,
+                    vm_slot: vm_bind.slot(),
+                    unk_38: 1, // fixed
+                    event_generation: self.id as u32,
+                    buffer_slot: scene.slot(),
+                    unk_44: 0,
+                    cmd_seq: U64(ev_vtx.cmd_seq),
+                    unk_50: 0,
+                    unk_pointer: inner_weak_ptr!(ptr, unk_pointee),
+                    unk_job_buf: inner_weak_ptr!(ptr, unk_buf_0),
+                    unk_64: 0x0, // fixed
+                    unk_68: unk1.into(),
+                    uuid: uuid_ta,
+                    unk_70: 0x0,                // fixed
+                    unk_74: Default::default(), // fixed
+                    unk_15c: 0x0,               // fixed
+                    unk_160: U64(0x0),          // fixed
+                    unk_168: 0x0,               // fixed
+                    unk_16c: 0x0,               // fixed
+                    unk_170: U64(0x0),          // fixed
+                    #[ver(V >= V13_0B4)]
+                    counter: U64(count_vtx),
+                    #[ver(V >= V13_0B4)]
+                    notifier_buf: inner_weak_ptr!(notifier.weak_pointer(), state.unk_buf),
+                    unk_178: 0x0, // padding?
+                })?;
+
+                if vtx_result.is_some() {
+                    builder.add(microseq::Timestamp::ver {
+                        header: microseq::op::Timestamp::new(true),
+                        cur_ts: inner_weak_ptr!(ptr, cur_ts),
+                        start_ts: inner_weak_ptr!(ptr, start_ts),
+                        update_ts: inner_weak_ptr!(ptr, start_ts),
+                        work_queue: ev_vtx.info_ptr,
+                        unk_24: U64(0),
+                        #[ver(V >= V13_0B4)]
+                        unk_ts: inner_weak_ptr!(ptr, unk_ts),
+                        uuid: uuid_ta,
+                        unk_30_padding: 0,
+                    })?;
+                }
+
+                builder.add(microseq::WaitForIdle {
+                    header: microseq::op::WaitForIdle::new(microseq::Pipe::Vertex),
+                })?;
+
+                if vtx_result.is_some() {
+                    builder.add(microseq::Timestamp::ver {
+                        header: microseq::op::Timestamp::new(false),
+                        cur_ts: inner_weak_ptr!(ptr, cur_ts),
+                        start_ts: inner_weak_ptr!(ptr, start_ts),
+                        update_ts: inner_weak_ptr!(ptr, end_ts),
+                        work_queue: ev_vtx.info_ptr,
+                        unk_24: U64(0),
+                        #[ver(V >= V13_0B4)]
+                        unk_ts: inner_weak_ptr!(ptr, unk_ts),
+                        uuid: uuid_ta,
+                        unk_30_padding: 0,
+                    })?;
+                }
+
+                let off = builder.offset_to(start_vtx);
+                builder.add(microseq::FinalizeVertex::ver {
+                    header: microseq::op::FinalizeVertex::HEADER,
+                    scene: scene.weak_pointer(),
+                    buffer: scene.weak_buffer_pointer(),
+                    stats,
+                    work_queue: ev_vtx.info_ptr,
+                    vm_slot: vm_bind.slot(),
+                    unk_28: 0x0, // fixed
+                    unk_pointer: inner_weak_ptr!(ptr, unk_pointee),
+                    unk_34: 0x0, // fixed
+                    uuid: uuid_ta,
+                    fw_stamp: ev_vtx.fw_stamp_pointer,
+                    stamp_value: ev_vtx.value.next(),
+                    unk_48: U64(0x0), // fixed
+                    unk_50: 0x0,      // fixed
+                    unk_54: 0x0,      // fixed
+                    unk_58: U64(0x0), // fixed
+                    unk_60: 0x0,      // fixed
+                    unk_64: 0x0,      // fixed
+                    unk_68: 0x0,      // fixed
+                    #[ver(G >= G14 && V < V13_0B4)]
+                    unk_68_g14: U64(0),
+                    restart_branch_offset: off,
+                    unk_70: 0x0, // fixed
+                    #[ver(V >= V13_0B4)]
+                    unk_74: Default::default(), // Ventura
+                })?;
+
+                builder.add(microseq::RetireStamp {
+                    header: microseq::op::RetireStamp::HEADER,
+                })?;
+
+                Ok(box_in_place!(fw::vertex::RunVertex::ver {
+                    notifier: notifier,
+                    scene: scene.clone(),
+                    micro_seq: builder.build(&mut kalloc.private)?,
+                    vm_bind: vm_bind.clone(),
+                    timestamps: timestamps,
+                })?)
+            },
+            |inner, ptr| {
+                #[ver(G < G14)]
+                let core_masks = gpu.core_masks_packed();
+                Ok(place!(
+                    ptr,
+                    fw::vertex::raw::RunVertex::ver {
+                        tag: fw::workqueue::CommandType::RunVertex,
+                        #[ver(V >= V13_0B4)]
+                        counter: U64(count_vtx),
+                        vm_slot: vm_bind.slot(),
+                        unk_8: 0,
+                        notifier: inner.notifier.gpu_pointer(),
+                        buffer_slot: inner.scene.slot(),
+                        unk_1c: 0,
+                        buffer: inner.scene.buffer_pointer(),
+                        scene: inner.scene.gpu_pointer(),
+                        unk_buffer_buf: inner.scene.kernel_buffer_pointer(),
+                        unk_34: 0,
+                        job_params1: fw::vertex::raw::JobParameters1::ver {
+                            unk_0: U64(if unk1 { 0 } else { 0x200 }), // sometimes 0
+                            unk_8: f32!(1e-20),                       // fixed
+                            unk_c: f32!(1e-20),                       // fixed
+                            tvb_tilemap: inner.scene.tvb_tilemap_pointer(),
+                            #[ver(G < G14)]
+                            tvb_cluster_tilemaps: inner.scene.cluster_tilemaps_pointer(),
+                            tpc: inner.scene.tpc_pointer(),
+                            tvb_heapmeta: inner
+                                .scene
+                                .tvb_heapmeta_pointer()
+                                .or(0x8000_0000_0000_0000),
+                            iogpu_unk_54: 0x6b0003, // fixed
+                            iogpu_unk_55: 0x3a0012, // fixed
+                            iogpu_unk_56: U64(0x1), // fixed
+                            #[ver(G < G14)]
+                            tvb_cluster_meta1: inner
+                                .scene
+                                .meta_1_pointer()
+                                .map(|x| x.or((tile_info.meta1_blocks as u64) << 50)),
+                            utile_config: utile_config,
+                            unk_4c: 0,
+                            ppp_multisamplectl: U64(cmdbuf.ppp_multisamplectl), // fixed
+                            tvb_heapmeta_2: inner.scene.tvb_heapmeta_pointer(),
+                            #[ver(G < G14)]
+                            unk_60: U64(0x0), // fixed
+                            #[ver(G < G14)]
+                            core_mask: Array::new([
+                                *core_masks.first().unwrap_or(&0),
+                                *core_masks.get(1).unwrap_or(&0),
+                            ]),
+                            preempt_buf1: inner.scene.preempt_buf_1_pointer(),
+                            preempt_buf2: inner.scene.preempt_buf_2_pointer(),
+                            unk_80: U64(0x1), // fixed
+                            preempt_buf3: inner
+                                .scene
+                                .preempt_buf_3_pointer()
+                                .or(0x4_0000_0000_0000), // check
+                            encoder_addr: U64(cmdbuf.encoder_ptr),
+                            #[ver(G < G14)]
+                            tvb_cluster_meta2: inner.scene.meta_2_pointer(),
+                            #[ver(G < G14)]
+                            tvb_cluster_meta3: inner.scene.meta_3_pointer(),
+                            #[ver(G < G14)]
+                            tiling_control: tiling_control,
+                            #[ver(G < G14)]
+                            unk_ac: Default::default(), // fixed
+                            unk_b0: Default::default(), // fixed
+                            pipeline_base: U64(0x11_00000000),
+                            #[ver(G < G14)]
+                            tvb_cluster_meta4: inner
+                                .scene
+                                .meta_4_pointer()
+                                .map(|x| x.or(0x3000_0000_0000_0000)),
+                            #[ver(G < G14)]
+                            unk_f0: U64(0x1c + align(tile_info.meta1_blocks, 4) as u64),
+                            unk_f8: U64(0x8c60),         // fixed
+                            unk_100: Default::default(), // fixed
+                            unk_118: 0x1c,               // fixed
+                            #[ver(G >= G14)]
+                            __pad: Default::default(),
+                        },
+                        unk_154: Default::default(),
+                        tiling_params: tile_info.params,
+                        unk_3e8: Default::default(),
+                        tpc: inner.scene.tpc_pointer(),
+                        tpc_size: U64(tile_info.tpc_size as u64),
+                        microsequence: inner.micro_seq.gpu_pointer(),
+                        microsequence_size: inner.micro_seq.len() as u32,
+                        fragment_stamp_slot: ev_frag.slot,
+                        fragment_stamp_value: ev_frag.value.next(),
+                        unk_pointee: 0,
+                        unk_pad: 0,
+                        job_params2: fw::vertex::raw::JobParameters2 {
+                            unk_480: Default::default(), // fixed
+                            unk_498: U64(0x0),           // fixed
+                            unk_4a0: 0x0,                // fixed
+                            preempt_buf1: inner.scene.preempt_buf_1_pointer(),
+                            unk_4ac: 0x0,      // fixed
+                            unk_4b0: U64(0x0), // fixed
+                            unk_4b8: 0x0,      // fixed
+                            unk_4bc: U64(0x0), // fixed
+                            unk_4c4_padding: Default::default(),
+                            unk_50c: 0x0,      // fixed
+                            unk_510: U64(0x0), // fixed
+                            unk_518: U64(0x0), // fixed
+                            unk_520: U64(0x0), // fixed
+                        },
+                        encoder_params: fw::job::raw::EncoderParams {
+                            unk_8: 0x0,  // fixed
+                            unk_c: 0x0,  // fixed
+                            unk_10: 0x0, // fixed
+                            encoder_id: cmdbuf.encoder_id,
+                            unk_18: 0x0, // fixed
+                            iogpu_compute_unk44: 0xffffffff,
+                            seq_buffer: inner.scene.seq_buf_pointer(),
+                            unk_28: U64(0x0), // fixed
+                        },
+                        unk_55c: 0,
+                        unk_560: 0,
+                        memoryless_rts_used: (cmdbuf.flags
+                            & bindings::ASAHI_RENDER_MEMORYLESS_RTS_USED as u64
+                            != 0) as u32,
+                        unk_568: 0,
+                        unk_56c: 0,
+                        meta: fw::job::raw::JobMeta {
+                            unk_4: 0,
+                            stamp: ev_vtx.stamp_pointer,
+                            fw_stamp: ev_vtx.fw_stamp_pointer,
+                            stamp_value: ev_vtx.value.next(),
+                            stamp_slot: ev_vtx.slot,
+                            evctl_index: 0, // fixed
+                            flush_stamps: flush_stamps as u32,
+                            uuid: uuid_ta,
+                            cmd_seq: ev_vtx.cmd_seq as u32,
+                        },
+                        unk_after_meta: unk1.into(),
+                        unk_buf_0: U64(0),
+                        unk_buf_8: U64(0),
+                        unk_buf_10: U64(0),
+                        cur_ts: U64(0),
+                        start_ts: Some(inner_ptr!(inner.timestamps.gpu_pointer(), vtx.start)),
+                        end_ts: Some(inner_ptr!(inner.timestamps.gpu_pointer(), vtx.end)),
+                        unk_5c4: 0,
+                        unk_5c8: 0,
+                        unk_5cc: 0,
+                        unk_5d0: 0,
+                        client_sequence: slot_client_seq,
+                        pad_5d5: Default::default(),
+                        unk_5d8: 0,
+                        unk_5dc: 0,
+                        #[ver(V >= V13_0B4)]
+                        unk_ts: U64(0),
+                        #[ver(V >= V13_0B4)]
+                        unk_5dd_8: Default::default(),
+                    }
+                ))
+            },
+        )?;
+
+        core::mem::drop(alloc);
+
+        mod_dev_dbg!(self.dev, "[Submission {}] Add Vertex\n", id);
+        fence.add_command();
+        vtx_job.add_cb(vtx, vm_bind.slot(), move |cmd, error| {
+            if let Some(err) = error {
+                fence.set_error(err.into())
+            }
+            if let Some(mut res) = vtx_result.as_ref().map(|a| a.lock()) {
+                cmd.timestamps.with(|raw, _inner| {
+                    res.result.vertex_ts_start = raw.vtx.start.load(Ordering::Relaxed);
+                    res.result.vertex_ts_end = raw.vtx.end.load(Ordering::Relaxed);
+                });
+                res.result.tvb_usage_bytes = cmd.scene.used_bytes() as u64;
+                if cmd.scene.overflowed() {
+                    res.result.flags |= bindings::DRM_ASAHI_RESULT_RENDER_TVB_OVERFLOWED as u64;
+                }
+                res.vtx_error = error;
+                res.vtx_complete = true;
+                res.commit();
+            }
+            fence.command_complete();
+        })?;
+
+        mod_dev_dbg!(self.dev, "[Submission {}] Increment counters\n", id);
+        self.notifier.threshold.with(|raw, _inner| {
+            raw.increment();
+            raw.increment();
+        });
+
+        // TODO: handle rollbacks, move to job submit?
+        buffer.increment();
+
+        job.get_vtx()?.next_seq();
+        job.get_frag()?.next_seq();
+
+        Ok(())
+    }
+}
diff --git a/drivers/gpu/drm/asahi/regs.rs b/drivers/gpu/drm/asahi/regs.rs
new file mode 100644
index 000000000000..019d7214793d
--- /dev/null
+++ b/drivers/gpu/drm/asahi/regs.rs
@@ -0,0 +1,387 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+
+//! GPU MMIO register abstraction
+//!
+//! Since the vast majority of the interactions with the GPU are brokered through the firmware,
+//! there is very little need to interact directly with GPU MMIO register. This module abstracts
+//! the few operations that require that, mainly reading the MMU fault status, reading GPU ID
+//! information, and starting the GPU firmware coprocessor.
+
+use crate::hw;
+use kernel::{device, io_mem::IoMem, platform, prelude::*};
+
+/// Size of the ASC control MMIO region.
+pub(crate) const ASC_CTL_SIZE: usize = 0x4000;
+
+/// Size of the SGX MMIO region.
+pub(crate) const SGX_SIZE: usize = 0x1000000;
+
+const CPU_CONTROL: usize = 0x44;
+const CPU_RUN: u32 = 0x1 << 4; // BIT(4)
+
+const FAULT_INFO: usize = 0x17030;
+
+const ID_VERSION: usize = 0xd04000;
+const ID_UNK08: usize = 0xd04008;
+const ID_COUNTS_1: usize = 0xd04010;
+const ID_COUNTS_2: usize = 0xd04014;
+const ID_UNK18: usize = 0xd04018;
+const ID_CLUSTERS: usize = 0xd0401c;
+
+const CORE_MASK_0: usize = 0xd01500;
+const CORE_MASK_1: usize = 0xd01514;
+
+/// Enum representing the unit that caused an MMU fault.
+#[allow(non_camel_case_types)]
+#[allow(clippy::upper_case_acronyms)]
+#[derive(Copy, Clone, Debug, Eq, PartialEq)]
+pub(crate) enum FaultUnit {
+    /// Decompress / pixel fetch
+    DCMP(u8),
+    /// USC L1 Cache (device loads/stores)
+    UL1C(u8),
+    /// Compress / pixel store
+    CMP(u8),
+    GSL1(u8),
+    IAP(u8),
+    VCE(u8),
+    /// Tiling Engine
+    TE(u8),
+    RAS(u8),
+    /// Vertex Data Master
+    VDM(u8),
+    PPP(u8),
+    /// ISP Parameter Fetch
+    IPF(u8),
+    IPF_CPF(u8),
+    VF(u8),
+    VF_CPF(u8),
+    /// Depth/Stencil load/store
+    ZLS(u8),
+
+    /// Parameter Management
+    dPM,
+    /// Compute Data Master
+    dCDM_KS(u8),
+    dIPP,
+    dIPP_CS,
+    // Vertex Data Master
+    dVDM_CSD,
+    dVDM_SSD,
+    dVDM_ILF,
+    dVDM_ILD,
+    dRDE(u8),
+    FC,
+    GSL2,
+
+    /// Graphics L2 Cache Control?
+    GL2CC_META(u8),
+    GL2CC_MB,
+
+    /// Parameter Management
+    gPM_SP(u8),
+    /// Vertex Data Master - CSD
+    gVDM_CSD_SP(u8),
+    gVDM_SSD_SP(u8),
+    gVDM_ILF_SP(u8),
+    gVDM_TFP_SP(u8),
+    gVDM_MMB_SP(u8),
+    /// Compute Data Master
+    gCDM_CS_KS0_SP(u8),
+    gCDM_CS_KS1_SP(u8),
+    gCDM_CS_KS2_SP(u8),
+    gCDM_KS0_SP(u8),
+    gCDM_KS1_SP(u8),
+    gCDM_KS2_SP(u8),
+    gIPP_SP(u8),
+    gIPP_CS_SP(u8),
+    gRDE0_SP(u8),
+    gRDE1_SP(u8),
+
+    Unknown(u8),
+}
+
+/// Reason for an MMU fault.
+#[derive(Copy, Clone, Debug, Eq, PartialEq)]
+pub(crate) enum FaultReason {
+    Unmapped,
+    AfFault,
+    WriteOnly,
+    ReadOnly,
+    NoAccess,
+    Unknown(u8),
+}
+
+/// Collection of information about an MMU fault.
+#[derive(Copy, Clone, Debug, Eq, PartialEq)]
+pub(crate) struct FaultInfo {
+    pub(crate) address: u64,
+    pub(crate) sideband: u8,
+    pub(crate) vm_slot: u32,
+    pub(crate) unit_code: u8,
+    pub(crate) unit: FaultUnit,
+    pub(crate) level: u8,
+    pub(crate) unk_5: u8,
+    pub(crate) read: bool,
+    pub(crate) reason: FaultReason,
+}
+
+/// Device resources for this GPU instance.
+pub(crate) struct Resources {
+    dev: device::Device,
+    asc: IoMem<ASC_CTL_SIZE>,
+    sgx: IoMem<SGX_SIZE>,
+}
+
+impl Resources {
+    /// Map the required resources given our platform device.
+    pub(crate) fn new(pdev: &mut platform::Device) -> Result<Resources> {
+        // TODO: add device abstraction to ioremap by name
+        let asc_res = unsafe { pdev.ioremap_resource(0)? };
+        let sgx_res = unsafe { pdev.ioremap_resource(1)? };
+
+        Ok(Resources {
+            // SAFETY: This device does DMA via the UAT IOMMU.
+            dev: device::Device::from_dev(pdev),
+            asc: asc_res,
+            sgx: sgx_res,
+        })
+    }
+
+    fn sgx_read32(&self, off: usize) -> u32 {
+        self.sgx.readl_relaxed(off)
+    }
+
+    /* Not yet used
+    fn sgx_write32(&self, off: usize, val: u32) {
+        self.sgx.writel_relaxed(val, off)
+    }
+    */
+
+    fn sgx_read64(&self, off: usize) -> u64 {
+        self.sgx.readq_relaxed(off)
+    }
+
+    /* Not yet used
+    fn sgx_write64(&self, off: usize, val: u64) {
+        self.sgx.writeq_relaxed(val, off)
+    }
+    */
+
+    /// Initialize the MMIO registers for the GPU.
+    pub(crate) fn init_mmio(&self) -> Result {
+        // Nothing to do for now...
+
+        Ok(())
+    }
+
+    /// Start the ASC coprocessor CPU.
+    pub(crate) fn start_cpu(&self) -> Result {
+        let val = self.asc.readl_relaxed(CPU_CONTROL);
+
+        self.asc.writel_relaxed(val | CPU_RUN, CPU_CONTROL);
+
+        Ok(())
+    }
+
+    /// Get the GPU identification info from registers.
+    ///
+    /// See [`hw::GpuIdConfig`] for the result.
+    pub(crate) fn get_gpu_id(&self) -> Result<hw::GpuIdConfig> {
+        let id_version = self.sgx_read32(ID_VERSION);
+        let id_unk08 = self.sgx_read32(ID_UNK08);
+        let id_counts_1 = self.sgx_read32(ID_COUNTS_1);
+        let id_counts_2 = self.sgx_read32(ID_COUNTS_2);
+        let id_unk18 = self.sgx_read32(ID_UNK18);
+        let id_clusters = self.sgx_read32(ID_CLUSTERS);
+
+        dev_info!(
+            self.dev,
+            "GPU ID registers: {:#x} {:#x} {:#x} {:#x} {:#x} {:#x}\n",
+            id_version,
+            id_unk08,
+            id_counts_1,
+            id_counts_2,
+            id_unk18,
+            id_clusters
+        );
+
+        let core_mask_0 = self.sgx_read32(CORE_MASK_0);
+        let core_mask_1 = self.sgx_read32(CORE_MASK_1);
+        let mut core_mask = (core_mask_0 as u64) | ((core_mask_1 as u64) << 32);
+
+        dev_info!(self.dev, "Core mask: {:#x}\n", core_mask);
+
+        let num_clusters = (id_clusters >> 12) & 0xff;
+        let num_cores = id_counts_1 & 0xff;
+
+        if num_cores * num_clusters > 64 {
+            dev_err!(
+                self.dev,
+                "Too many total cores ({} x {} > 64)\n",
+                num_clusters,
+                num_cores
+            );
+            return Err(ENODEV);
+        }
+
+        let mut core_masks = Vec::new();
+        let mut total_active_cores: u32 = 0;
+
+        let max_core_mask = (1u64 << num_cores) - 1;
+        for _i in 0..num_clusters {
+            let mask = core_mask & max_core_mask;
+            core_masks.try_push(mask as u32)?;
+            core_mask >>= num_cores;
+            total_active_cores += mask.count_ones();
+        }
+        let mut core_masks_packed = Vec::new();
+        core_masks_packed.try_push(core_mask_0)?;
+        if core_mask_1 != 0 {
+            core_masks_packed.try_push(core_mask_1)?;
+        }
+
+        if core_mask != 0 {
+            dev_err!(self.dev, "Leftover core mask: {:#x}\n", core_mask);
+            return Err(EIO);
+        }
+
+        let (gpu_rev, gpu_rev_id) = match (id_version >> 8) & 0xff {
+            0x00 => (hw::GpuRevision::A0, hw::GpuRevisionID::A0),
+            0x01 => (hw::GpuRevision::A1, hw::GpuRevisionID::A1),
+            0x10 => (hw::GpuRevision::B0, hw::GpuRevisionID::B0),
+            0x11 => (hw::GpuRevision::B1, hw::GpuRevisionID::B1),
+            0x20 => (hw::GpuRevision::C0, hw::GpuRevisionID::C0),
+            0x21 => (hw::GpuRevision::C1, hw::GpuRevisionID::C1),
+            a => {
+                dev_err!(self.dev, "Unknown GPU revision {}\n", a);
+                return Err(ENODEV);
+            }
+        };
+
+        Ok(hw::GpuIdConfig {
+            gpu_gen: match (id_version >> 24) & 0xff {
+                4 => hw::GpuGen::G13,
+                5 => hw::GpuGen::G14,
+                a => {
+                    dev_err!(self.dev, "Unknown GPU generation {}\n", a);
+                    return Err(ENODEV);
+                }
+            },
+            gpu_variant: match (id_version >> 16) & 0xff {
+                1 => hw::GpuVariant::P, // Guess
+                2 => hw::GpuVariant::G,
+                3 => hw::GpuVariant::S,
+                4 => {
+                    if num_clusters > 4 {
+                        hw::GpuVariant::D
+                    } else {
+                        hw::GpuVariant::C
+                    }
+                }
+                a => {
+                    dev_err!(self.dev, "Unknown GPU variant {}\n", a);
+                    return Err(ENODEV);
+                }
+            },
+            gpu_rev,
+            gpu_rev_id,
+            max_dies: (id_clusters >> 20) & 0xf,
+            num_clusters,
+            num_cores,
+            num_frags: (id_counts_1 >> 8) & 0xff,
+            num_gps: (id_counts_2 >> 16) & 0xff,
+            total_active_cores,
+            core_masks,
+            core_masks_packed,
+        })
+    }
+
+    /// Get the fault information from the MMU status register, if one occurred.
+    pub(crate) fn get_fault_info(&self) -> Option<FaultInfo> {
+        let fault_info = self.sgx_read64(FAULT_INFO);
+
+        if fault_info & 1 == 0 {
+            return None;
+        }
+
+        let unit_code = ((fault_info >> 9) & 0xff) as u8;
+        let unit = match unit_code {
+            0x00..=0x9f => match unit_code & 0xf {
+                0x0 => FaultUnit::DCMP(unit_code >> 4),
+                0x1 => FaultUnit::UL1C(unit_code >> 4),
+                0x2 => FaultUnit::CMP(unit_code >> 4),
+                0x3 => FaultUnit::GSL1(unit_code >> 4),
+                0x4 => FaultUnit::IAP(unit_code >> 4),
+                0x5 => FaultUnit::VCE(unit_code >> 4),
+                0x6 => FaultUnit::TE(unit_code >> 4),
+                0x7 => FaultUnit::RAS(unit_code >> 4),
+                0x8 => FaultUnit::VDM(unit_code >> 4),
+                0x9 => FaultUnit::PPP(unit_code >> 4),
+                0xa => FaultUnit::IPF(unit_code >> 4),
+                0xb => FaultUnit::IPF_CPF(unit_code >> 4),
+                0xc => FaultUnit::VF(unit_code >> 4),
+                0xd => FaultUnit::VF_CPF(unit_code >> 4),
+                0xe => FaultUnit::ZLS(unit_code >> 4),
+                _ => FaultUnit::Unknown(unit_code),
+            },
+            0xa1 => FaultUnit::dPM,
+            0xa2 => FaultUnit::dCDM_KS(0),
+            0xa3 => FaultUnit::dCDM_KS(1),
+            0xa4 => FaultUnit::dCDM_KS(2),
+            0xa5 => FaultUnit::dIPP,
+            0xa6 => FaultUnit::dIPP_CS,
+            0xa7 => FaultUnit::dVDM_CSD,
+            0xa8 => FaultUnit::dVDM_SSD,
+            0xa9 => FaultUnit::dVDM_ILF,
+            0xaa => FaultUnit::dVDM_ILD,
+            0xab => FaultUnit::dRDE(0),
+            0xac => FaultUnit::dRDE(1),
+            0xad => FaultUnit::FC,
+            0xae => FaultUnit::GSL2,
+            0xb0..=0xb7 => FaultUnit::GL2CC_META(unit_code & 0xf),
+            0xb8 => FaultUnit::GL2CC_MB,
+            0xe0..=0xff => match unit_code & 0xf {
+                0x0 => FaultUnit::gPM_SP((unit_code >> 4) & 1),
+                0x1 => FaultUnit::gVDM_CSD_SP((unit_code >> 4) & 1),
+                0x2 => FaultUnit::gVDM_SSD_SP((unit_code >> 4) & 1),
+                0x3 => FaultUnit::gVDM_ILF_SP((unit_code >> 4) & 1),
+                0x4 => FaultUnit::gVDM_TFP_SP((unit_code >> 4) & 1),
+                0x5 => FaultUnit::gVDM_MMB_SP((unit_code >> 4) & 1),
+                0x6 => FaultUnit::gCDM_CS_KS0_SP((unit_code >> 4) & 1),
+                0x7 => FaultUnit::gCDM_CS_KS1_SP((unit_code >> 4) & 1),
+                0x8 => FaultUnit::gCDM_CS_KS2_SP((unit_code >> 4) & 1),
+                0x9 => FaultUnit::gCDM_KS0_SP((unit_code >> 4) & 1),
+                0xa => FaultUnit::gCDM_KS1_SP((unit_code >> 4) & 1),
+                0xb => FaultUnit::gCDM_KS2_SP((unit_code >> 4) & 1),
+                0xc => FaultUnit::gIPP_SP((unit_code >> 4) & 1),
+                0xd => FaultUnit::gIPP_CS_SP((unit_code >> 4) & 1),
+                0xe => FaultUnit::gRDE0_SP((unit_code >> 4) & 1),
+                0xf => FaultUnit::gRDE1_SP((unit_code >> 4) & 1),
+                _ => FaultUnit::Unknown(unit_code),
+            },
+            _ => FaultUnit::Unknown(unit_code),
+        };
+
+        let reason = match (fault_info >> 1) & 0x7 {
+            0 => FaultReason::Unmapped,
+            1 => FaultReason::AfFault,
+            2 => FaultReason::WriteOnly,
+            3 => FaultReason::ReadOnly,
+            4 => FaultReason::NoAccess,
+            a => FaultReason::Unknown(a as u8),
+        };
+
+        Some(FaultInfo {
+            address: (fault_info >> 30) << 6,
+            sideband: ((fault_info >> 23) & 0x7f) as u8,
+            vm_slot: ((fault_info >> 17) & 0x3f) as u32,
+            unit_code,
+            unit,
+            level: ((fault_info >> 7) & 3) as u8,
+            unk_5: ((fault_info >> 5) & 3) as u8,
+            read: (fault_info & (1 << 4)) != 0,
+            reason,
+        })
+    }
+}
diff --git a/drivers/gpu/drm/asahi/slotalloc.rs b/drivers/gpu/drm/asahi/slotalloc.rs
new file mode 100644
index 000000000000..6493111643fe
--- /dev/null
+++ b/drivers/gpu/drm/asahi/slotalloc.rs
@@ -0,0 +1,292 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+
+//! Generic slot allocator
+//!
+//! This is a simple allocator to manage fixed-size pools of GPU resources that are transiently
+//! required during command execution. Each item resides in a "slot" at a given index. Users borrow
+//! and return free items from the available pool.
+//!
+//! Allocations are "sticky", and return a token that callers can use to request the same slot
+//! again later. This allows slots to be lazily invalidated, so that multiple uses by the same user
+//! avoid any actual cleanup work.
+//!
+//! The allocation policy is currently a simple LRU mechanism, doing a full linear scan over the
+//! slots when no token was previously provided. This is probably good enough, since in the absence
+//! of serious system contention most allocation requests will be immediately fulfilled from the
+//! previous slot without doing an LRU scan.
+
+use core::ops::{Deref, DerefMut};
+use kernel::{
+    error::{code::*, Result},
+    prelude::*,
+    sync::{Arc, CondVar, Mutex, UniqueArc},
+};
+
+/// Trait representing a single item within a slot.
+pub(crate) trait SlotItem {
+    /// Arbitrary user data associated with the SlotAllocator.
+    type Data;
+
+    /// Called eagerly when this item is released back into the available pool.
+    fn release(&mut self, _data: &mut Self::Data, _slot: u32) {}
+}
+
+/// Trivial implementation for users which do not require any slot data nor any allocator data.
+impl SlotItem for () {
+    type Data = ();
+}
+
+/// Represents a current or previous allocation of an item from a slot. Users keep `SlotToken`s
+/// around across allocations to request that, if possible, the same slot be reused.
+#[derive(Copy, Clone, Debug)]
+pub(crate) struct SlotToken {
+    time: u64,
+    slot: u32,
+}
+
+impl SlotToken {
+    /// Returns the slot index that this token represents a past assignment to.
+    pub(crate) fn last_slot(&self) -> u32 {
+        self.slot
+    }
+}
+
+/// A guard representing active ownership of a slot.
+pub(crate) struct Guard<T: SlotItem> {
+    item: Option<T>,
+    changed: bool,
+    token: SlotToken,
+    alloc: Arc<SlotAllocatorOuter<T>>,
+}
+
+impl<T: SlotItem> Guard<T> {
+    /// Returns the active slot owned by this `Guard`.
+    pub(crate) fn slot(&self) -> u32 {
+        self.token.slot
+    }
+
+    /// Returns `true` if the slot changed since the last allocation (or no `SlotToken` was
+    /// provided), or `false` if the previously allocated slot was successfully re-acquired with
+    /// no other users in the interim.
+    pub(crate) fn changed(&self) -> bool {
+        self.changed
+    }
+
+    /// Returns a `SlotToken` that can be used to re-request the same slot at a later time, after
+    /// this `Guard` is dropped.
+    pub(crate) fn token(&self) -> SlotToken {
+        self.token
+    }
+}
+
+impl<T: SlotItem> Deref for Guard<T> {
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
+        self.item.as_ref().expect("SlotItem Guard lost our item!")
+    }
+}
+
+impl<T: SlotItem> DerefMut for Guard<T> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        self.item.as_mut().expect("SlotItem Guard lost our item!")
+    }
+}
+
+/// A slot item that is currently free.
+struct Entry<T: SlotItem> {
+    item: T,
+    get_time: u64,
+    drop_time: u64,
+}
+
+/// Inner data for the `SlotAllocator`, protected by a `Mutex`.
+struct SlotAllocatorInner<T: SlotItem> {
+    data: T::Data,
+    slots: Vec<Option<Entry<T>>>,
+    get_count: u64,
+    drop_count: u64,
+}
+
+/// A single slot allocator instance.
+struct SlotAllocatorOuter<T: SlotItem> {
+    inner: Mutex<SlotAllocatorInner<T>>,
+    cond: CondVar,
+}
+
+/// A shared reference to a slot allocator instance.
+pub(crate) struct SlotAllocator<T: SlotItem>(Arc<SlotAllocatorOuter<T>>);
+
+impl<T: SlotItem> SlotAllocator<T> {
+    /// Creates a new `SlotAllocator`, with a fixed number of slots and arbitrary associated data.
+    ///
+    /// The caller provides a constructor callback which takes a reference to the `T::Data` and
+    /// creates a single slot. This is called during construction to create all the initial
+    /// items, which then live the lifetime of the `SlotAllocator`.
+    pub(crate) fn new(
+        num_slots: u32,
+        mut data: T::Data,
+        mut constructor: impl FnMut(&mut T::Data, u32) -> T,
+    ) -> Result<SlotAllocator<T>> {
+        let mut slots = Vec::try_with_capacity(num_slots as usize)?;
+
+        for i in 0..num_slots {
+            slots
+                .try_push(Some(Entry {
+                    item: constructor(&mut data, i),
+                    get_time: 0,
+                    drop_time: 0,
+                }))
+                .expect("try_push() failed after reservation");
+        }
+
+        let inner = SlotAllocatorInner {
+            data,
+            slots,
+            get_count: 0,
+            drop_count: 0,
+        };
+
+        let mut alloc = Pin::from(UniqueArc::try_new(SlotAllocatorOuter {
+            // SAFETY: `condvar_init!` is called below.
+            cond: unsafe { CondVar::new() },
+            // SAFETY: `mutex_init!` is called below.
+            inner: unsafe { Mutex::new(inner) },
+        })?);
+
+        // SAFETY: `cond` is pinned when `alloc` is.
+        let pinned = unsafe { alloc.as_mut().map_unchecked_mut(|s| &mut s.cond) };
+        kernel::condvar_init!(pinned, "SlotAllocator::cond");
+
+        // SAFETY: `inner` is pinned when `alloc` is.
+        let pinned = unsafe { alloc.as_mut().map_unchecked_mut(|s| &mut s.inner) };
+        kernel::mutex_init!(pinned, "SlotAllocator::inner");
+
+        Ok(SlotAllocator(alloc.into()))
+    }
+
+    /// Calls a callback on the inner data associated with this allocator, taking the lock.
+    pub(crate) fn with_inner<RetVal>(&self, cb: impl FnOnce(&mut T::Data) -> RetVal) -> RetVal {
+        let mut inner = self.0.inner.lock();
+        cb(&mut inner.data)
+    }
+
+    /// Gets a fresh slot, optionally reusing a previous allocation if a `SlotToken` is provided.
+    ///
+    /// Blocks if no slots are free.
+    pub(crate) fn get(&self, token: Option<SlotToken>) -> Result<Guard<T>> {
+        self.get_inner(token, |_a, _b| Ok(()))
+    }
+
+    /// Gets a fresh slot, optionally reusing a previous allocation if a `SlotToken` is provided.
+    ///
+    /// Blocks if no slots are free.
+    ///
+    /// This version allows the caller to pass in a callback that gets a mutable reference to the
+    /// user data for the allocator and the freshly acquired slot, which is called before the
+    /// allocator lock is released. This can be used to perform bookkeeping associated with
+    /// specific slots (such as tracking their current owner).
+    pub(crate) fn get_inner(
+        &self,
+        token: Option<SlotToken>,
+        cb: impl FnOnce(&mut T::Data, &mut Guard<T>) -> Result<()>,
+    ) -> Result<Guard<T>> {
+        let mut inner = self.0.inner.lock();
+
+        if let Some(token) = token {
+            let slot = &mut inner.slots[token.slot as usize];
+            if slot.is_some() {
+                let count = slot.as_ref().unwrap().get_time;
+                if count == token.time {
+                    let mut guard = Guard {
+                        item: Some(slot.take().unwrap().item),
+                        token,
+                        changed: false,
+                        alloc: self.0.clone(),
+                    };
+                    cb(&mut inner.data, &mut guard)?;
+                    return Ok(guard);
+                }
+            }
+        }
+
+        let mut first = true;
+        let slot = loop {
+            let mut oldest_time = u64::MAX;
+            let mut oldest_slot = 0u32;
+
+            for (i, slot) in inner.slots.iter().enumerate() {
+                if let Some(slot) = slot.as_ref() {
+                    if slot.drop_time < oldest_time {
+                        oldest_slot = i as u32;
+                        oldest_time = slot.drop_time;
+                    }
+                }
+            }
+
+            if oldest_time == u64::MAX {
+                if first {
+                    pr_warn!(
+                        "{}: out of slots, blocking\n",
+                        core::any::type_name::<Self>()
+                    );
+                }
+                first = false;
+                if self.0.cond.wait(&mut inner) {
+                    return Err(ERESTARTSYS);
+                }
+            } else {
+                break oldest_slot;
+            }
+        };
+
+        inner.get_count += 1;
+
+        let item = inner.slots[slot as usize]
+            .take()
+            .expect("Someone stole our slot?")
+            .item;
+
+        let mut guard = Guard {
+            item: Some(item),
+            changed: true,
+            token: SlotToken {
+                time: inner.get_count,
+                slot,
+            },
+            alloc: self.0.clone(),
+        };
+
+        cb(&mut inner.data, &mut guard)?;
+        Ok(guard)
+    }
+}
+
+impl<T: SlotItem> Clone for SlotAllocator<T> {
+    fn clone(&self) -> Self {
+        SlotAllocator(self.0.clone())
+    }
+}
+
+impl<T: SlotItem> Drop for Guard<T> {
+    fn drop(&mut self) {
+        let mut inner = self.alloc.inner.lock();
+        if inner.slots[self.token.slot as usize].is_some() {
+            pr_crit!(
+                "{}: tried to return an item into a full slot ({})\n",
+                core::any::type_name::<Self>(),
+                self.token.slot
+            );
+        } else {
+            inner.drop_count += 1;
+            let mut item = self.item.take().expect("Guard lost its item");
+            item.release(&mut inner.data, self.token.slot);
+            inner.slots[self.token.slot as usize] = Some(Entry {
+                item,
+                get_time: self.token.time,
+                drop_time: inner.drop_count,
+            });
+            self.alloc.cond.notify_one();
+        }
+    }
+}
diff --git a/drivers/gpu/drm/asahi/util.rs b/drivers/gpu/drm/asahi/util.rs
new file mode 100644
index 000000000000..8d1a37f17cd8
--- /dev/null
+++ b/drivers/gpu/drm/asahi/util.rs
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+
+//! Miscellaneous utility functions
+
+use core::ops::{Add, BitAnd, Div, Not, Sub};
+
+/// Aligns an integer type to a power of two.
+pub(crate) fn align<T>(a: T, b: T) -> T
+where
+    T: Copy
+        + Default
+        + BitAnd<Output = T>
+        + Not<Output = T>
+        + Add<Output = T>
+        + Sub<Output = T>
+        + Div<Output = T>
+        + core::cmp::PartialEq,
+{
+    let def: T = Default::default();
+    #[allow(clippy::eq_op)]
+    let one: T = !def / !def;
+
+    assert!((b & (b - one)) == def);
+
+    (a + b - one) & !(b - one)
+}
+
+/// Integer division rounding up.
+pub(crate) fn div_ceil<T>(a: T, b: T) -> T
+where
+    T: Copy
+        + Default
+        + BitAnd<Output = T>
+        + Not<Output = T>
+        + Add<Output = T>
+        + Sub<Output = T>
+        + Div<Output = T>,
+{
+    let def: T = Default::default();
+    #[allow(clippy::eq_op)]
+    let one: T = !def / !def;
+
+    (a + b - one) / b
+}
diff --git a/drivers/gpu/drm/asahi/workqueue.rs b/drivers/gpu/drm/asahi/workqueue.rs
new file mode 100644
index 000000000000..ce1d1f89e48e
--- /dev/null
+++ b/drivers/gpu/drm/asahi/workqueue.rs
@@ -0,0 +1,880 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+
+//! GPU command execution queues
+//!
+//! The AGX GPU firmware schedules GPU work commands out of work queues, which are ring buffers of
+//! pointers to work commands. There can be an arbitrary number of work queues. Work queues have an
+//! associated type (vertex, fragment, or compute) and may only contain generic commands or commands
+//! specific to that type.
+//!
+//! This module manages queueing work commands into a work queue and submitting them for execution
+//! by the firmware. An active work queue needs an event to signal completion of its work, which is
+//! owned by what we call a batch. This event then notifies the work queue when work is completed,
+//! and that triggers freeing of all resources associated with that work. An idle work queue gives
+//! up its associated event.
+
+use crate::debug::*;
+use crate::fw::channels::PipeType;
+use crate::fw::types::*;
+use crate::fw::workqueue::*;
+use crate::object::OpaqueGpuObject;
+use crate::regs::FaultReason;
+use crate::{box_in_place, no_debug, place};
+use crate::{channel, driver, event, fw, gpu, object, regs};
+use core::num::NonZeroU64;
+use core::sync::atomic::Ordering;
+use kernel::{
+    bindings,
+    error::code::*,
+    prelude::*,
+    sync::{Arc, Guard, Mutex, UniqueArc},
+};
+
+const DEBUG_CLASS: DebugFlags = DebugFlags::WorkQueue;
+
+const MAX_JOB_SLOTS: u32 = 127;
+
+/// An enum of possible errors that might cause a piece of work to fail execution.
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub(crate) enum WorkError {
+    /// GPU timeout (command execution took too long).
+    Timeout,
+    /// GPU MMU fault (invalid access).
+    Fault(regs::FaultInfo),
+    /// Work failed due to an error caused by other concurrent GPU work.
+    Killed,
+    /// The GPU crashed.
+    NoDevice,
+    /// Unknown reason.
+    Unknown,
+}
+
+impl From<WorkError> for bindings::drm_asahi_result_info {
+    fn from(err: WorkError) -> Self {
+        match err {
+            WorkError::Fault(info) => Self {
+                status: bindings::drm_asahi_status_DRM_ASAHI_STATUS_FAULT,
+                fault_type: match info.reason {
+                    FaultReason::Unmapped => bindings::drm_asahi_fault_DRM_ASAHI_FAULT_UNMAPPED,
+                    FaultReason::AfFault => bindings::drm_asahi_fault_DRM_ASAHI_FAULT_AF_FAULT,
+                    FaultReason::WriteOnly => bindings::drm_asahi_fault_DRM_ASAHI_FAULT_WRITE_ONLY,
+                    FaultReason::ReadOnly => bindings::drm_asahi_fault_DRM_ASAHI_FAULT_READ_ONLY,
+                    FaultReason::NoAccess => bindings::drm_asahi_fault_DRM_ASAHI_FAULT_NO_ACCESS,
+                    FaultReason::Unknown(_) => bindings::drm_asahi_fault_DRM_ASAHI_FAULT_UNKNOWN,
+                },
+                unit: info.unit_code.into(),
+                sideband: info.sideband.into(),
+                level: info.level,
+                extra: info.unk_5.into(),
+                is_read: info.read as u8,
+                pad: 0,
+                address: info.address,
+            },
+            a => Self {
+                status: match a {
+                    WorkError::Timeout => bindings::drm_asahi_status_DRM_ASAHI_STATUS_TIMEOUT,
+                    WorkError::Killed => bindings::drm_asahi_status_DRM_ASAHI_STATUS_KILLED,
+                    WorkError::NoDevice => bindings::drm_asahi_status_DRM_ASAHI_STATUS_NO_DEVICE,
+                    _ => bindings::drm_asahi_status_DRM_ASAHI_STATUS_UNKNOWN_ERROR,
+                },
+                ..Default::default()
+            },
+        }
+    }
+}
+
+impl From<WorkError> for kernel::error::Error {
+    fn from(err: WorkError) -> Self {
+        match err {
+            WorkError::Timeout => ETIMEDOUT,
+            // Not EFAULT because that's for userspace faults
+            WorkError::Fault(_) => EIO,
+            WorkError::Unknown => ENODATA,
+            WorkError::Killed => ECANCELED,
+            WorkError::NoDevice => ENODEV,
+        }
+    }
+}
+
+/// A GPU context tracking structure, which must be explicitly invalidated when dropped.
+pub(crate) struct GpuContext {
+    dev: driver::AsahiDevice,
+    data: GpuObject<fw::workqueue::GpuContextData>,
+}
+no_debug!(GpuContext);
+
+impl GpuContext {
+    /// Allocate a new GPU context.
+    pub(crate) fn new(
+        dev: &driver::AsahiDevice,
+        alloc: &mut gpu::KernelAllocators,
+    ) -> Result<GpuContext> {
+        Ok(GpuContext {
+            dev: dev.clone(),
+            data: alloc
+                .shared
+                .new_object(Default::default(), |_inner| Default::default())?,
+        })
+    }
+
+    /// Returns the GPU pointer to the inner GPU context data structure.
+    pub(crate) fn gpu_pointer(&self) -> GpuPointer<'_, fw::workqueue::GpuContextData> {
+        self.data.gpu_pointer()
+    }
+}
+
+impl Drop for GpuContext {
+    fn drop(&mut self) {
+        mod_dev_dbg!(self.dev, "GpuContext: Invalidating GPU context\n");
+        let dev = self.dev.data();
+        if dev.gpu.invalidate_context(&self.data).is_err() {
+            dev_err!(self.dev, "GpuContext: Failed to invalidate GPU context!\n");
+        }
+    }
+}
+
+struct SubmittedWork<O, C>
+where
+    O: OpaqueGpuObject,
+    C: FnOnce(O, Option<WorkError>) + Send + Sync + 'static,
+{
+    object: O,
+    value: EventValue,
+    error: Option<WorkError>,
+    wptr: u32,
+    vm_slot: u32,
+    callback: C,
+}
+
+trait GenSubmittedWork: Send + Sync {
+    fn gpu_va(&self) -> NonZeroU64;
+    fn value(&self) -> event::EventValue;
+    fn wptr(&self) -> u32;
+    fn set_wptr(&mut self, wptr: u32);
+    fn mark_error(&mut self, error: WorkError);
+    fn complete(self: Box<Self>);
+}
+
+impl<O: OpaqueGpuObject, C: FnOnce(O, Option<WorkError>) + Send + Sync> GenSubmittedWork
+    for SubmittedWork<O, C>
+{
+    fn gpu_va(&self) -> NonZeroU64 {
+        self.object.gpu_va()
+    }
+
+    fn value(&self) -> event::EventValue {
+        self.value
+    }
+
+    fn wptr(&self) -> u32 {
+        self.wptr
+    }
+
+    fn set_wptr(&mut self, wptr: u32) {
+        self.wptr = wptr;
+    }
+
+    fn complete(self: Box<Self>) {
+        let SubmittedWork {
+            object,
+            value: _,
+            error,
+            wptr: _,
+            vm_slot: _,
+            callback,
+        } = *self;
+
+        callback(object, error);
+    }
+
+    fn mark_error(&mut self, error: WorkError) {
+        mod_pr_debug!("WorkQueue: Command at value {:#x?} failed\n", self.value);
+        self.error = Some(match error {
+            WorkError::Fault(info) if info.vm_slot != self.vm_slot => WorkError::Killed,
+            err => err,
+        });
+    }
+}
+
+/// Inner data for managing a single work queue.
+#[versions(AGX)]
+struct WorkQueueInner {
+    event_manager: Arc<event::EventManager>,
+    info: GpuObject<QueueInfo::ver>,
+    new: bool,
+    pipe_type: PipeType,
+    size: u32,
+    wptr: u32,
+    pending: Vec<Box<dyn GenSubmittedWork>>,
+    last_token: Option<event::Token>,
+    pending_jobs: usize,
+    last_submitted: Option<event::EventValue>,
+    last_completed: Option<event::EventValue>,
+    event: Option<(event::Event, event::EventValue)>,
+    priority: u32,
+    commit_seq: u64,
+    submit_seq: u64,
+}
+
+/// An instance of a work queue.
+#[versions(AGX)]
+pub(crate) struct WorkQueue {
+    info_pointer: GpuWeakPointer<QueueInfo::ver>,
+    inner: Mutex<WorkQueueInner::ver>,
+}
+
+#[versions(AGX)]
+impl WorkQueueInner::ver {
+    /// Return the GPU done pointer, representing how many work items have been completed by the
+    /// GPU.
+    fn doneptr(&self) -> u32 {
+        self.info
+            .state
+            .with(|raw, _inner| raw.gpu_doneptr.load(Ordering::Acquire))
+    }
+}
+
+#[versions(AGX)]
+#[derive(Copy, Clone)]
+pub(crate) struct QueueEventInfo {
+    pub(crate) stamp_pointer: GpuWeakPointer<Stamp>,
+    pub(crate) fw_stamp_pointer: GpuWeakPointer<FwStamp>,
+    pub(crate) slot: u32,
+    pub(crate) value: event::EventValue,
+    pub(crate) cmd_seq: u64,
+    pub(crate) info_ptr: GpuWeakPointer<QueueInfo::ver>,
+}
+
+#[versions(AGX)]
+pub(crate) struct Job {
+    wq: Arc<WorkQueue::ver>,
+    event_info: QueueEventInfo::ver,
+    start_value: EventValue,
+    pending: Vec<Box<dyn GenSubmittedWork>>,
+    committed: bool,
+    submitted: bool,
+    event_count: usize,
+}
+
+#[versions(AGX)]
+pub(crate) struct JobSubmission<'a> {
+    inner: Option<Guard<'a, Mutex<WorkQueueInner::ver>>>,
+    wptr: u32,
+    event_count: usize,
+    command_count: usize,
+}
+
+#[versions(AGX)]
+impl Job::ver {
+    pub(crate) fn event_info(&self) -> QueueEventInfo::ver {
+        let mut info = self.event_info;
+        info.cmd_seq += self.event_count as u64;
+
+        info
+    }
+
+    pub(crate) fn next_seq(&mut self) {
+        self.event_count += 1;
+        self.event_info.value.increment();
+    }
+
+    pub(crate) fn add<O: object::OpaqueGpuObject + 'static>(
+        &mut self,
+        command: O,
+        vm_slot: u32,
+    ) -> Result {
+        self.add_cb(command, vm_slot, |_, _| {})
+    }
+
+    pub(crate) fn add_cb<O: object::OpaqueGpuObject + 'static>(
+        &mut self,
+        command: O,
+        vm_slot: u32,
+        callback: impl FnOnce(O, Option<WorkError>) + Sync + Send + 'static,
+    ) -> Result {
+        if self.committed {
+            pr_err!("WorkQueue: Tried to mutate committed Job\n");
+            return Err(EINVAL);
+        }
+
+        self.pending.try_push(Box::try_new(SubmittedWork::<_, _> {
+            object: command,
+            value: self.event_info.value.next(),
+            error: None,
+            callback,
+            wptr: 0,
+            vm_slot,
+        })?)?;
+
+        Ok(())
+    }
+
+    pub(crate) fn commit(&mut self) -> Result {
+        if self.committed {
+            pr_err!("WorkQueue: Tried to commit committed Job\n");
+            return Err(EINVAL);
+        }
+
+        if self.pending.is_empty() {
+            pr_err!("WorkQueue: Job::commit() with no commands\n");
+            return Err(EINVAL);
+        }
+
+        let mut inner = self.wq.inner.lock();
+
+        let ev = inner.event.as_mut().expect("WorkQueue: Job lost its event");
+
+        if ev.1 != self.start_value {
+            pr_err!(
+                "WorkQueue: Job::commit() out of order (event slot {} {:?} != {:?}\n",
+                ev.0.slot(),
+                ev.1,
+                self.start_value
+            );
+            return Err(EINVAL);
+        }
+
+        ev.1 = self.event_info.value;
+        inner.commit_seq += self.pending.len() as u64;
+        self.committed = true;
+
+        Ok(())
+    }
+
+    pub(crate) fn can_submit(&self) -> bool {
+        self.wq.free_slots() > self.event_count && self.wq.free_space() > self.pending.len()
+    }
+
+    pub(crate) fn submit(&mut self) -> Result<JobSubmission::ver<'_>> {
+        if !self.committed {
+            pr_err!("WorkQueue: Tried to submit uncommitted Job\n");
+            return Err(EINVAL);
+        }
+
+        if self.submitted {
+            pr_err!("WorkQueue: Tried to submit Job twice\n");
+            return Err(EINVAL);
+        }
+
+        if self.pending.is_empty() {
+            pr_err!("WorkQueue: Job::submit() with no commands\n");
+            return Err(EINVAL);
+        }
+
+        let mut inner = self.wq.inner.lock();
+
+        if inner.submit_seq != self.event_info.cmd_seq {
+            pr_err!(
+                "WorkQueue: Job::submit() out of order (submit_seq {} != {})\n",
+                inner.submit_seq,
+                self.event_info.cmd_seq
+            );
+            return Err(EINVAL);
+        }
+
+        if inner.commit_seq < (self.event_info.cmd_seq + self.pending.len() as u64) {
+            pr_err!(
+                "WorkQueue: Job::submit() out of order (commit_seq {} != {})\n",
+                inner.commit_seq,
+                (self.event_info.cmd_seq + self.pending.len() as u64)
+            );
+            return Err(EINVAL);
+        }
+
+        let mut wptr = inner.wptr;
+        let command_count = self.pending.len();
+
+        if inner.free_space() <= command_count {
+            pr_err!("WorkQueue: Job does not fit in ring buffer\n");
+            return Err(EBUSY);
+        }
+
+        inner.pending.try_reserve(command_count)?;
+
+        inner.last_submitted = inner.event.as_ref().map(|e| e.1);
+
+        for mut command in self.pending.drain(..) {
+            command.set_wptr(wptr);
+
+            let next_wptr = (wptr + 1) % inner.size;
+            assert!(inner.doneptr() != next_wptr);
+            inner.info.ring[wptr as usize] = command.gpu_va().get();
+            wptr = next_wptr;
+
+            // Cannot fail, since we did a try_reserve(1) above
+            inner
+                .pending
+                .try_push(command)
+                .expect("try_push() failed after try_reserve()");
+        }
+
+        self.submitted = true;
+
+        Ok(JobSubmission::ver {
+            inner: Some(inner),
+            wptr,
+            command_count,
+            event_count: self.event_count,
+        })
+    }
+}
+
+#[versions(AGX)]
+impl<'a> JobSubmission::ver<'a> {
+    pub(crate) fn run(mut self, channel: &mut channel::PipeChannel::ver) {
+        let command_count = self.command_count;
+        let mut inner = self.inner.take().expect("No inner?");
+        let wptr = self.wptr;
+        core::mem::forget(self);
+
+        inner
+            .info
+            .state
+            .with(|raw, _inner| raw.cpu_wptr.store(wptr, Ordering::Release));
+
+        inner.wptr = wptr;
+
+        let event = inner.event.as_mut().expect("JobSubmission lost its event");
+
+        let event_slot = event.0.slot();
+
+        let msg = fw::channels::RunWorkQueueMsg::ver {
+            pipe_type: inner.pipe_type,
+            work_queue: Some(inner.info.weak_pointer()),
+            wptr: inner.wptr,
+            event_slot,
+            is_new: inner.new,
+            __pad: Default::default(),
+        };
+        channel.send(&msg);
+        inner.new = false;
+
+        inner.submit_seq += command_count as u64;
+    }
+
+    pub(crate) fn pipe_type(&self) -> PipeType {
+        self.inner.as_ref().expect("No inner?").pipe_type
+    }
+
+    pub(crate) fn priority(&self) -> u32 {
+        self.inner.as_ref().expect("No inner?").priority
+    }
+}
+
+#[versions(AGX)]
+impl Drop for Job::ver {
+    fn drop(&mut self) {
+        mod_pr_debug!("WorkQueue: Dropping Job\n");
+        let mut inner = self.wq.inner.lock();
+
+        if self.committed && !self.submitted {
+            let pipe_type = inner.pipe_type;
+            let event = inner.event.as_mut().expect("Job lost its event");
+            mod_pr_debug!(
+                "WorkQueue({:?}): Roll back {} events (slot {} val {:#x?}) and {} commands\n",
+                pipe_type,
+                self.event_count,
+                event.0.slot(),
+                event.1,
+                self.pending.len()
+            );
+            event.1.sub(self.event_count as u32);
+            inner.commit_seq -= self.pending.len() as u64;
+        }
+
+        inner.pending_jobs -= 1;
+
+        if inner.pending.is_empty() && inner.pending_jobs == 0 {
+            mod_pr_debug!("WorkQueue({:?}): Dropping event\n", inner.pipe_type);
+            inner.event = None;
+            inner.last_submitted = None;
+            inner.last_completed = None;
+        }
+        mod_pr_debug!("WorkQueue({:?}): Dropped Job\n", inner.pipe_type);
+    }
+}
+
+#[versions(AGX)]
+impl<'a> Drop for JobSubmission::ver<'a> {
+    fn drop(&mut self) {
+        let inner = self.inner.as_mut().expect("No inner?");
+        mod_pr_debug!("WorkQueue({:?}): Dropping JobSubmission\n", inner.pipe_type);
+
+        let new_len = inner.pending.len() - self.command_count;
+        inner.pending.truncate(new_len);
+
+        let pipe_type = inner.pipe_type;
+        let event = inner.event.as_mut().expect("JobSubmission lost its event");
+        mod_pr_debug!(
+            "WorkQueue({:?}): Roll back {} events (slot {} val {:#x?}) and {} commands\n",
+            pipe_type,
+            self.event_count,
+            event.0.slot(),
+            event.1,
+            self.command_count
+        );
+        event.1.sub(self.event_count as u32);
+        inner.commit_seq -= self.command_count as u64;
+        mod_pr_debug!("WorkQueue({:?}): Dropped JobSubmission\n", inner.pipe_type);
+    }
+}
+
+#[versions(AGX)]
+impl WorkQueueInner::ver {
+    /// Return the number of free entries in the workqueue
+    pub(crate) fn free_space(&self) -> usize {
+        self.size as usize - self.pending.len() - 1
+    }
+
+    pub(crate) fn free_slots(&self) -> usize {
+        let busy_slots = if let Some(ls) = self.last_submitted {
+            let lc = self
+                .last_completed
+                .expect("last_submitted but not completed?");
+            ls.delta(&lc)
+        } else {
+            0
+        };
+
+        ((MAX_JOB_SLOTS as i32) - busy_slots).max(0) as usize
+    }
+}
+
+#[versions(AGX)]
+impl WorkQueue::ver {
+    /// Create a new WorkQueue of a given type and priority.
+    #[allow(clippy::too_many_arguments)]
+    pub(crate) fn new(
+        alloc: &mut gpu::KernelAllocators,
+        event_manager: Arc<event::EventManager>,
+        gpu_context: Arc<GpuContext>,
+        notifier_list: Arc<GpuObject<fw::event::NotifierList>>,
+        pipe_type: PipeType,
+        id: u64,
+        priority: u32,
+        size: u32,
+    ) -> Result<Arc<WorkQueue::ver>> {
+        let mut info = box_in_place!(QueueInfo::ver {
+            state: alloc.shared.new_default::<RingState>()?,
+            ring: alloc.shared.array_empty(size as usize)?,
+            gpu_buf: alloc.private.array_empty(0x2c18)?,
+            notifier_list: notifier_list,
+            gpu_context: gpu_context,
+        })?;
+
+        info.state.with_mut(|raw, _inner| {
+            raw.rb_size = size;
+        });
+
+        let inner = WorkQueueInner::ver {
+            event_manager,
+            info: alloc.private.new_boxed(info, |inner, ptr| {
+                Ok(place!(
+                    ptr,
+                    raw::QueueInfo::ver {
+                        state: inner.state.gpu_pointer(),
+                        ring: inner.ring.gpu_pointer(),
+                        notifier_list: inner.notifier_list.gpu_pointer(),
+                        gpu_buf: inner.gpu_buf.gpu_pointer(),
+                        gpu_rptr1: Default::default(),
+                        gpu_rptr2: Default::default(),
+                        gpu_rptr3: Default::default(),
+                        event_id: AtomicI32::new(-1),
+                        priority: *raw::PRIORITY.get(priority as usize).ok_or(EINVAL)?,
+                        unk_4c: -1,
+                        uuid: id as u32,
+                        unk_54: -1,
+                        unk_58: Default::default(),
+                        busy: Default::default(),
+                        __pad: Default::default(),
+                        unk_84_state: Default::default(),
+                        unk_88: 0,
+                        unk_8c: 0,
+                        unk_90: 0,
+                        unk_94: 0,
+                        pending: Default::default(),
+                        unk_9c: 0,
+                        #[ver(V >= V13_2)]
+                        unk_a0_0: 0,
+                        gpu_context: inner.gpu_context.gpu_pointer(),
+                        unk_a8: Default::default(),
+                        #[ver(V >= V13_2)]
+                        unk_b0: 0,
+                    }
+                ))
+            })?,
+            new: true,
+            pipe_type,
+            size,
+            wptr: 0,
+            pending: Vec::new(),
+            last_token: None,
+            event: None,
+            priority,
+            pending_jobs: 0,
+            commit_seq: 0,
+            submit_seq: 0,
+            last_completed: None,
+            last_submitted: None,
+        };
+
+        let mut queue = Pin::from(UniqueArc::try_new(Self {
+            info_pointer: inner.info.weak_pointer(),
+            // SAFETY: `mutex_init!` is called below.
+            inner: unsafe { Mutex::new(inner) },
+        })?);
+
+        // SAFETY: `inner` is pinned when `queue` is.
+        let pinned = unsafe { queue.as_mut().map_unchecked_mut(|s| &mut s.inner) };
+        match pipe_type {
+            PipeType::Vertex => kernel::mutex_init!(pinned, "WorkQueue::inner (Vertex)"),
+            PipeType::Fragment => kernel::mutex_init!(pinned, "WorkQueue::inner (Fragment)"),
+            PipeType::Compute => kernel::mutex_init!(pinned, "WorkQueue::inner (Compute)"),
+        }
+
+        Ok(queue.into())
+    }
+
+    pub(crate) fn event_info(&self) -> Option<QueueEventInfo::ver> {
+        let inner = self.inner.lock();
+
+        inner.event.as_ref().map(|ev| QueueEventInfo::ver {
+            stamp_pointer: ev.0.stamp_pointer(),
+            fw_stamp_pointer: ev.0.fw_stamp_pointer(),
+            slot: ev.0.slot(),
+            value: ev.1,
+            cmd_seq: inner.commit_seq,
+            info_ptr: self.info_pointer,
+        })
+    }
+
+    pub(crate) fn new_job(self: &Arc<Self>) -> Result<Job::ver> {
+        let mut inner = self.inner.lock();
+
+        if inner.event.is_none() {
+            mod_pr_debug!("WorkQueue({:?}): Grabbing event\n", inner.pipe_type);
+            let event = inner.event_manager.get(inner.last_token, self.clone())?;
+            let cur = event.current();
+            inner.last_token = Some(event.token());
+            mod_pr_debug!(
+                "WorkQueue({:?}): Grabbed event slot {}: {:#x?}\n",
+                inner.pipe_type,
+                event.slot(),
+                cur
+            );
+            inner.event = Some((event, cur));
+            inner.last_submitted = Some(cur);
+            inner.last_completed = Some(cur);
+        }
+
+        inner.pending_jobs += 1;
+
+        let ev = &inner.event.as_ref().unwrap();
+
+        mod_pr_debug!("WorkQueue({:?}): New job\n", inner.pipe_type);
+        Ok(Job::ver {
+            wq: self.clone(),
+            event_info: QueueEventInfo::ver {
+                stamp_pointer: ev.0.stamp_pointer(),
+                fw_stamp_pointer: ev.0.fw_stamp_pointer(),
+                slot: ev.0.slot(),
+                value: ev.1,
+                cmd_seq: inner.commit_seq,
+                info_ptr: self.info_pointer,
+            },
+            start_value: ev.1,
+            pending: Vec::new(),
+            event_count: 0,
+            committed: false,
+            submitted: false,
+        })
+    }
+
+    /// Return the number of free entries in the workqueue
+    pub(crate) fn free_space(&self) -> usize {
+        self.inner.lock().free_space()
+    }
+
+    /// Return the number of free job slots in the workqueue
+    pub(crate) fn free_slots(&self) -> usize {
+        self.inner.lock().free_slots()
+    }
+
+    pub(crate) fn pipe_type(&self) -> PipeType {
+        self.inner.lock().pipe_type
+    }
+}
+
+/// Trait used to erase the version-specific type of WorkQueues, to avoid leaking
+/// version-specificity into the event module.
+pub(crate) trait WorkQueue {
+    fn signal(&self) -> bool;
+    fn mark_error(&self, value: event::EventValue, error: WorkError);
+    fn fail_all(&self, error: WorkError);
+}
+
+#[versions(AGX)]
+impl WorkQueue for WorkQueue::ver {
+    /// Signal a workqueue that some work was completed.
+    ///
+    /// This will check the event stamp value to find out exactly how many commands were processed.
+    fn signal(&self) -> bool {
+        let mut inner = self.inner.lock();
+        let event = inner.event.as_ref();
+        let value = match event {
+            None => {
+                pr_err!("WorkQueue: signal() called but no event?\n");
+                return true;
+            }
+            Some(event) => event.0.current(),
+        };
+
+        inner.last_completed = Some(value);
+
+        mod_pr_debug!(
+            "WorkQueue({:?}): Signaling event {:?} value {:#x?}\n",
+            inner.pipe_type,
+            inner.last_token,
+            value
+        );
+
+        let mut completed_commands: usize = 0;
+
+        for cmd in inner.pending.iter() {
+            if cmd.value() <= value {
+                mod_pr_debug!(
+                    "WorkQueue({:?}): Command at value {:#x?} complete\n",
+                    inner.pipe_type,
+                    cmd.value()
+                );
+                completed_commands += 1;
+            } else {
+                break;
+            }
+        }
+
+        if completed_commands == 0 {
+            return inner.pending.is_empty();
+        }
+
+        let mut completed = Vec::new();
+
+        if completed.try_reserve(completed_commands).is_err() {
+            pr_crit!(
+                "WorkQueue({:?}): Failed to allocated space for {} completed commands\n",
+                inner.pipe_type,
+                completed_commands
+            );
+        }
+
+        let pipe_type = inner.pipe_type;
+
+        for cmd in inner.pending.drain(..completed_commands) {
+            if completed.try_push(cmd).is_err() {
+                pr_crit!(
+                    "WorkQueue({:?}): Failed to signal a completed command\n",
+                    pipe_type,
+                );
+            }
+        }
+
+        mod_pr_debug!(
+            "WorkQueue({:?}): Completed {} commands\n",
+            inner.pipe_type,
+            completed_commands
+        );
+
+        if let Some(i) = completed.last() {
+            inner
+                .info
+                .state
+                .with(|raw, _inner| raw.cpu_freeptr.store(i.wptr(), Ordering::Release));
+        }
+
+        let empty = inner.pending.is_empty();
+        if empty && inner.pending_jobs == 0 {
+            inner.event = None;
+            inner.last_submitted = None;
+            inner.last_completed = None;
+        }
+
+        core::mem::drop(inner);
+
+        for cmd in completed {
+            cmd.complete();
+        }
+
+        empty
+    }
+
+    /// Mark this queue's work up to a certain stamp value as having failed.
+    fn mark_error(&self, value: event::EventValue, error: WorkError) {
+        // If anything is marked completed, we can consider it successful
+        // at this point, even if we didn't get the signal event yet.
+        self.signal();
+
+        let mut inner = self.inner.lock();
+
+        if inner.event.is_none() {
+            pr_err!("WorkQueue: signal_fault() called but no event?\n");
+            return;
+        }
+
+        mod_pr_debug!(
+            "WorkQueue({:?}): Signaling fault for event {:?} at value {:#x?}\n",
+            inner.pipe_type,
+            inner.last_token,
+            value
+        );
+
+        for cmd in inner.pending.iter_mut() {
+            if cmd.value() <= value {
+                cmd.mark_error(error);
+            } else {
+                break;
+            }
+        }
+    }
+
+    /// Mark all of this queue's work as having failed, and complete it.
+    fn fail_all(&self, error: WorkError) {
+        // If anything is marked completed, we can consider it successful
+        // at this point, even if we didn't get the signal event yet.
+        self.signal();
+
+        let mut inner = self.inner.lock();
+
+        if inner.event.is_none() {
+            pr_err!("WorkQueue: fail_all() called but no event?\n");
+            return;
+        }
+
+        mod_pr_debug!(
+            "WorkQueue({:?}): Failing all jobs {:?}\n",
+            inner.pipe_type,
+            error
+        );
+
+        let mut cmds = Vec::new();
+
+        core::mem::swap(&mut inner.pending, &mut cmds);
+
+        if inner.pending_jobs == 0 {
+            inner.event = None;
+        }
+
+        core::mem::drop(inner);
+
+        for mut cmd in cmds {
+            cmd.mark_error(error);
+            cmd.complete();
+        }
+    }
+}
+
+#[versions(AGX)]
+impl Drop for WorkQueue::ver {
+    fn drop(&mut self) {
+        mod_pr_debug!("WorkQueue({:?}): Dropping\n", self.inner.lock().pipe_type);
+    }
+}

-- 
2.35.1