diff --git a/mesa.spec b/mesa.spec
index af182f9..b559ef5 100644
--- a/mesa.spec
+++ b/mesa.spec
@@ -20,7 +20,7 @@
 Summary: Mesa graphics libraries
 Name: mesa
 Version: 7.3
-Release: 9%{?dist}
+Release: 10%{?dist}
 License: MIT
 Group: System Environment/Libraries
 URL: http://www.mesa3d.org
@@ -427,6 +427,9 @@ rm -rf $RPM_BUILD_ROOT
 %{_libdir}/mesa-demos-data
 
 %changelog
+* Thu Mar 05 2009 Dave Airlie <airlied@redhat.com> 7.3-10
+- radeon-rewrite.patch: fix linking against libdrm_radeon
+
 * Wed Mar 04 2009 Dave Airlie <airlied@redhat.com> 7.3-9
 - try again: pull in 7.4 fixes, dri configs changes, new radeon-rewrite
 
diff --git a/radeon-rewrite.patch b/radeon-rewrite.patch
index 0edb095..1eb201c 100644
--- a/radeon-rewrite.patch
+++ b/radeon-rewrite.patch
@@ -1,716 +1,20182 @@
-From c4030c794274b22ba6ccb7c919900b41f5c723f2 Mon Sep 17 00:00:00 2001
-From: Dave Airlie <airlied@redhat.com>
-Date: Wed, 4 Mar 2009 16:51:14 +1000
-Subject: [PATCH] radeon/r100/r200: import latest merge
+commit 263b887d85e3eac9a32673c8ed3004c3129ce997
+Author: Dave Airlie <airlied@redhat.com>
+Date:   Sun Feb 15 17:03:47 2009 +1000
 
----
- src/mesa/drivers/dri/radeon/radeon_bo_drm.h        |  182 ++++
- src/mesa/drivers/dri/radeon/radeon_bo_legacy.c     |  825 +++++++++++++++++
- src/mesa/drivers/dri/radeon/radeon_bo_legacy.h     |   47 +
- src/mesa/drivers/dri/radeon/radeon_bocs_wrapper.h  |   67 ++
- src/mesa/drivers/dri/radeon/radeon_cmdbuf.h        |  143 +++
- src/mesa/drivers/dri/radeon/radeon_common.c        |  849 +++++++++++++++++
- src/mesa/drivers/dri/radeon/radeon_common.h        |   55 ++
- .../drivers/dri/radeon/radeon_common_context.c     |  589 ++++++++++++
- .../drivers/dri/radeon/radeon_common_context.h     |  508 ++++++++++
- src/mesa/drivers/dri/radeon/radeon_cs_drm.h        |  207 +++++
- src/mesa/drivers/dri/radeon/radeon_cs_legacy.c     |  504 ++++++++++
- src/mesa/drivers/dri/radeon/radeon_cs_legacy.h     |   40 +
- src/mesa/drivers/dri/radeon/radeon_dma.c           |  323 +++++++
- src/mesa/drivers/dri/radeon/radeon_dma.h           |   51 +
- src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c   |  360 ++++++++
- src/mesa/drivers/dri/radeon/radeon_mipmap_tree.h   |   97 ++
- src/mesa/drivers/dri/radeon/radeon_texture.c       |  966 ++++++++++++++++++++
- src/mesa/drivers/dri/radeon/radeon_texture.h       |  118 +++
- 18 files changed, 5931 insertions(+), 0 deletions(-)
- create mode 100644 src/mesa/drivers/dri/radeon/radeon_bo_drm.h
- create mode 100644 src/mesa/drivers/dri/radeon/radeon_bo_legacy.c
- create mode 100644 src/mesa/drivers/dri/radeon/radeon_bo_legacy.h
- create mode 100644 src/mesa/drivers/dri/radeon/radeon_bocs_wrapper.h
- create mode 100644 src/mesa/drivers/dri/radeon/radeon_cmdbuf.h
- create mode 100644 src/mesa/drivers/dri/radeon/radeon_common.c
- create mode 100644 src/mesa/drivers/dri/radeon/radeon_common.h
- create mode 100644 src/mesa/drivers/dri/radeon/radeon_common_context.c
- create mode 100644 src/mesa/drivers/dri/radeon/radeon_common_context.h
- create mode 100644 src/mesa/drivers/dri/radeon/radeon_cs_drm.h
- create mode 100644 src/mesa/drivers/dri/radeon/radeon_cs_legacy.c
- create mode 100644 src/mesa/drivers/dri/radeon/radeon_cs_legacy.h
- create mode 100644 src/mesa/drivers/dri/radeon/radeon_dma.c
- create mode 100644 src/mesa/drivers/dri/radeon/radeon_dma.h
- create mode 100644 src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c
- create mode 100644 src/mesa/drivers/dri/radeon/radeon_mipmap_tree.h
- create mode 100644 src/mesa/drivers/dri/radeon/radeon_texture.c
- create mode 100644 src/mesa/drivers/dri/radeon/radeon_texture.h
+    radeon: add cflags to decide whether to link libdrm_radeon or not.
+    
+    You don't need libdrm_radeon for the legacy driver to build,
+    only for the experimental mm/cs paths.
 
-diff --git a/src/mesa/drivers/dri/radeon/radeon_bo_drm.h b/src/mesa/drivers/dri/radeon/radeon_bo_drm.h
-new file mode 100644
-index 0000000..1ed13f1
---- /dev/null
-+++ b/src/mesa/drivers/dri/radeon/radeon_bo_drm.h
-@@ -0,0 +1,182 @@
-+/* 
-+ * Copyright © 2008 Jérôme Glisse
-+ * All Rights Reserved.
-+ * 
-+ * Permission is hereby granted, free of charge, to any person obtaining
-+ * a copy of this software and associated documentation files (the
-+ * "Software"), to deal in the Software without restriction, including
-+ * without limitation the rights to use, copy, modify, merge, publish,
-+ * distribute, sub license, and/or sell copies of the Software, and to
-+ * permit persons to whom the Software is furnished to do so, subject to
-+ * the following conditions:
-+ * 
-+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
-+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
-+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
-+ *
-+ * The above copyright notice and this permission notice (including the
-+ * next paragraph) shall be included in all copies or substantial portions
-+ * of the Software.
-+ */
-+/*
-+ * Authors:
-+ *      Jérôme Glisse <glisse@freedesktop.org>
-+ */
-+#ifndef RADEON_BO_H
-+#define RADEON_BO_H
-+
-+#include <stdio.h>
-+#include <stdint.h>
-+//#include "radeon_track.h"
+commit 61e6b2aee3069700db397f26d7ae1384641367ff
+Author: Ian Romanick <idr@freedesktop.org>
+Date:   Fri Jan 9 15:43:17 2009 -0800
+
+    Track two sets of back-face stencil state
+    
+    Track separate back-face stencil state for OpenGL 2.0 /
+    GL_ATI_separate_stencil and GL_EXT_stencil_two_side.  This allows all
+    three to be enabled in a driver.  One set of state is set via the 2.0
+    or ATI functions and is used when STENCIL_TEST_TWO_SIDE_EXT is
+    disabled.  The other is set by StencilFunc and StencilOp when the
+    active stencil face is set to BACK.  The GL_EXT_stencil_two_side spec has
+    more details.
+    
+    http://opengl.org/registry/specs/EXT/stencil_two_side.txt
+
+commit 86691da4b5f43be625ec510b7fe40657b9985783
+Author: Dave Airlie <airlied@redhat.com>
+Date:   Wed Mar 4 16:51:14 2009 +1000
+
+    radeon/r100/r200: import latest merge
+diff --git a/configs/autoconf.in b/configs/autoconf.in
+index 4a89716..f18d119 100644
+--- a/configs/autoconf.in
++++ b/configs/autoconf.in
+@@ -20,6 +20,8 @@ CXXFLAGS = @CPPFLAGS@ @CXXFLAGS@ \
+ 	$(OPT_FLAGS) $(PIC_FLAGS) $(ARCH_FLAGS) $(DEFINES)
+ LDFLAGS = @LDFLAGS@
+ EXTRA_LIB_PATH = @EXTRA_LIB_PATH@
++RADEON_CFLAGS = @RADEON_CFLAGS@
++RADEON_LDFLAGS = @RADEON_LDFLAGS@
+ 
+ # Assembler
+ ASM_SOURCES = @ASM_SOURCES@
+diff --git a/configure.ac b/configure.ac
+index 73caf00..48f4eac 100644
+--- a/configure.ac
++++ b/configure.ac
+@@ -413,6 +413,8 @@ AC_SUBST([SRC_DIRS])
+ AC_SUBST([GLU_DIRS])
+ AC_SUBST([DRIVER_DIRS])
+ AC_SUBST([WINDOW_SYSTEM])
++AC_SUBST([RADEON_CFLAGS])
++AC_SUBST([RADEON_LDFLAGS])
+ 
+ dnl
+ dnl User supplied program configuration
+@@ -540,6 +542,13 @@ dri)
+     GL_PC_REQ_PRIV="libdrm >= $LIBDRM_REQUIRED dri2proto >= $DRI2PROTO_REQUIRED"
+     DRI_PC_REQ_PRIV="libdrm >= $LIBDRM_REQUIRED"
+ 
++    PKG_CHECK_MODULES([LIBDRM_RADEON], [libdrm_radeon], HAVE_LIBDRM_RADEON=yes, HAVE_LIBDRM_RADEON=no)
++
++    if test "$HAVE_LIBDRM_RADEON" = yes; then
++	RADEON_CFLAGS="-DHAVE_LIBDRM_RADEON=1 $LIBDRM_RADEON_CFLAGS"
++	RADEON_LDFLAGS=$LIBDRM_RADEON_LIBS
++    fi
++
+     # find the DRI deps for libGL
+     if test "$x11_pkgconfig" = yes; then
+         # add xcb modules if necessary
+diff --git a/src/mesa/drivers/dri/i965/brw_cc.c b/src/mesa/drivers/dri/i965/brw_cc.c
+index fa8121e..abae4b3 100644
+--- a/src/mesa/drivers/dri/i965/brw_cc.c
++++ b/src/mesa/drivers/dri/i965/brw_cc.c
+@@ -84,6 +84,7 @@ static void
+ cc_unit_populate_key(struct brw_context *brw, struct brw_cc_unit_key *key)
+ {
+    struct gl_stencil_attrib *stencil = brw->attribs.Stencil;
++   const unsigned back = stencil->_BackFace;
+ 
+    memset(key, 0, sizeof(*key));
+ 
+@@ -100,13 +101,13 @@ cc_unit_populate_key(struct brw_context *brw, struct brw_cc_unit_key *key)
+       key->stencil_test_mask[0] = stencil->ValueMask[0];
+    }
+    if (key->stencil_two_side) {
+-      key->stencil_func[1] = stencil->Function[1];
+-      key->stencil_fail_op[1] = stencil->FailFunc[1];
+-      key->stencil_pass_depth_fail_op[1] = stencil->ZFailFunc[1];
+-      key->stencil_pass_depth_pass_op[1] = stencil->ZPassFunc[1];
+-      key->stencil_ref[1] = stencil->Ref[1];
+-      key->stencil_write_mask[1] = stencil->WriteMask[1];
+-      key->stencil_test_mask[1] = stencil->ValueMask[1];
++      key->stencil_func[1] = stencil->Function[back];
++      key->stencil_fail_op[1] = stencil->FailFunc[back];
++      key->stencil_pass_depth_fail_op[1] = stencil->ZFailFunc[back];
++      key->stencil_pass_depth_pass_op[1] = stencil->ZPassFunc[back];
++      key->stencil_ref[1] = stencil->Ref[back];
++      key->stencil_write_mask[1] = stencil->WriteMask[back];
++      key->stencil_test_mask[1] = stencil->ValueMask[back];
+    }
+ 
+    if (brw->attribs.Color->_LogicOpEnabled)
+diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
+index c50b0d2..24149cf 100644
+--- a/src/mesa/drivers/dri/i965/brw_wm.c
++++ b/src/mesa/drivers/dri/i965/brw_wm.c
+@@ -189,8 +189,7 @@ static void brw_wm_populate_key( struct brw_context *brw,
+       lookup |= IZ_STENCIL_TEST_ENABLE_BIT;
+ 
+       if (brw->attribs.Stencil->WriteMask[0] ||
+-	  (brw->attribs.Stencil->_TestTwoSide &&
+-	   brw->attribs.Stencil->WriteMask[1]))
++	  brw->attribs.Stencil->WriteMask[brw->attribs.Stencil->_BackFace])
+ 	 lookup |= IZ_STENCIL_WRITE_ENABLE_BIT;
+    }
+ 
+diff --git a/src/mesa/drivers/dri/r200/Makefile b/src/mesa/drivers/dri/r200/Makefile
+index e9144ac..e593ed9 100644
+--- a/src/mesa/drivers/dri/r200/Makefile
++++ b/src/mesa/drivers/dri/r200/Makefile
+@@ -3,6 +3,8 @@
+ TOP = ../../../../..
+ include $(TOP)/configs/current
+ 
++CFLAGS += $(RADEON_CFLAGS)
++
+ LIBNAME = r200_dri.so
+ 
+ MINIGLX_SOURCES = server/radeon_dri.c 
+@@ -11,25 +13,35 @@ ifeq ($(USING_EGL), 1)
+ EGL_SOURCES = server/radeon_egl.c
+ endif
+ 
++RADEON_COMMON_SOURCES = \
++	radeon_texture.c \
++	radeon_common_context.c \
++	radeon_common.c \
++	radeon_dma.c \
++	radeon_lock.c \
++	radeon_bo_legacy.c \
++	radeon_cs_legacy.c \
++	radeon_mipmap_tree.c \
++	radeon_span.c
++
++
+ DRIVER_SOURCES = r200_context.c \
+ 		 r200_ioctl.c \
+-		 r200_lock.c \
+ 		 r200_state.c \
+ 		 r200_state_init.c \
+ 		 r200_cmdbuf.c \
+ 		 r200_pixel.c \
+ 		 r200_tex.c \
+-		 r200_texmem.c \
+ 		 r200_texstate.c \
+ 		 r200_tcl.c \
+ 		 r200_swtcl.c \
+-		 r200_span.c \
+ 		 r200_maos.c \
+ 		 r200_sanity.c \
+ 		 r200_fragshader.c \
+ 		 r200_vertprog.c \
+ 		 radeon_screen.c \
+-		 $(EGL_SOURCES)
++		 $(EGL_SOURCES) \
++		 $(RADEON_COMMON_SOURCES)
+ 
+ C_SOURCES = $(COMMON_SOURCES) $(DRIVER_SOURCES)
+ 
+@@ -48,7 +60,29 @@ SYMLINKS = \
+ COMMON_SYMLINKS = \
+ 	radeon_chipset.h \
+ 	radeon_screen.c \
+-	radeon_screen.h
++	radeon_screen.h \
++	radeon_bo_legacy.c \
++	radeon_cs_legacy.c \
++	radeon_bo_legacy.h \
++	radeon_cs_legacy.h \
++	radeon_bocs_wrapper.h \
++	radeon_span.h \
++	radeon_span.c \
++	radeon_lock.c \
++	radeon_lock.h \
++	radeon_common.c \
++	radeon_common_context.c \
++	radeon_common_context.h \
++	radeon_common.h \
++	radeon_cmdbuf.h \
++	radeon_mipmap_tree.c \
++	radeon_mipmap_tree.h \
++	radeon_texture.c \
++	radeon_texture.h \
++	radeon_dma.c \
++	radeon_dma.h
++
++DRI_LIB_DEPS += $(RADEON_LDFLAGS)
+ 
+ ##### TARGETS #####
+ 
+diff --git a/src/mesa/drivers/dri/r200/r200_cmdbuf.c b/src/mesa/drivers/dri/r200/r200_cmdbuf.c
+index e163377..ae31bcb 100644
+--- a/src/mesa/drivers/dri/r200/r200_cmdbuf.c
++++ b/src/mesa/drivers/dri/r200/r200_cmdbuf.c
+@@ -38,6 +38,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "swrast/swrast.h"
+ #include "main/simple_list.h"
+ 
++#include "radeon_common.h"
+ #include "r200_context.h"
+ #include "r200_state.h"
+ #include "r200_ioctl.h"
+@@ -45,18 +46,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "r200_sanity.h"
+ #include "radeon_reg.h"
+ 
+-static void print_state_atom( struct r200_state_atom *state )
+-{
+-   int i;
+-
+-   fprintf(stderr, "emit %s/%d\n", state->name, state->cmd_size);
+-
+-   if (0 & R200_DEBUG & DEBUG_VERBOSE) 
+-      for (i = 0 ; i < state->cmd_size ; i++) 
+-	 fprintf(stderr, "\t%s[%d]: %x\n", state->name, i, state->cmd[i]);
+-
+-}
+-
+ /* The state atoms will be emitted in the order they appear in the atom list,
+  * so this step is important.
+  */
+@@ -64,141 +53,56 @@ void r200SetUpAtomList( r200ContextPtr rmesa )
+ {
+    int i, mtu;
+ 
+-   mtu = rmesa->glCtx->Const.MaxTextureUnits;
+-
+-   make_empty_list(&rmesa->hw.atomlist);
+-   rmesa->hw.atomlist.name = "atom-list";
+-
+-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.ctx );
+-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.set );
+-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.lin );
+-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.msk );
+-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vpt );
+-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vtx );
+-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vap );
+-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vte );
+-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.msc );
+-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.cst );
+-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.zbs );
+-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.tcl );
+-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.msl );
+-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.tcg );
+-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.grd );
+-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.fog );
+-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.tam );
+-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.tf );
+-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.atf );
++   mtu = rmesa->radeon.glCtx->Const.MaxTextureUnits;
++
++   make_empty_list(&rmesa->radeon.hw.atomlist);
++   rmesa->radeon.hw.atomlist.name = "atom-list";
++
++   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.ctx );
++   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.set );
++   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.lin );
++   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.msk );
++   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.vpt );
++   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.vtx );
++   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.vap );
++   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.vte );
++   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.msc );
++   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.cst );
++   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.zbs );
++   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.tcl );
++   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.msl );
++   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.tcg );
++   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.grd );
++   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.fog );
++   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.tam );
++   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.tf );
++   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.atf );
+    for (i = 0; i < mtu; ++i)
+-       insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.tex[i] );
++       insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.tex[i] );
+    for (i = 0; i < mtu; ++i)
+-       insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.cube[i] );
++       insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.cube[i] );
+    for (i = 0; i < 6; ++i)
+-       insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.pix[i] );
+-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.afs[0] );
+-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.afs[1] );
++       insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.pix[i] );
++   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.afs[0] );
++   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.afs[1] );
+    for (i = 0; i < 8; ++i)
+-       insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.lit[i] );
++       insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.lit[i] );
+    for (i = 0; i < 3 + mtu; ++i)
+-       insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.mat[i] );
+-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.eye );
+-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.glt );
++       insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.mat[i] );
++   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.eye );
++   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.glt );
+    for (i = 0; i < 2; ++i)
+-      insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.mtl[i] );
++      insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.mtl[i] );
+    for (i = 0; i < 6; ++i)
+-       insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.ucp[i] );
+-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.spr );
+-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.ptp );
+-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.prf );
+-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.pvs );
+-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vpp[0] );
+-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vpp[1] );
+-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vpi[0] );
+-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vpi[1] );
+-}
+-
+-static void r200SaveHwState( r200ContextPtr rmesa )
+-{
+-   struct r200_state_atom *atom;
+-   char * dest = rmesa->backup_store.cmd_buf;
+-
+-   if (R200_DEBUG & DEBUG_STATE)
+-      fprintf(stderr, "%s\n", __FUNCTION__);
+-
+-   rmesa->backup_store.cmd_used = 0;
+-
+-   foreach( atom, &rmesa->hw.atomlist ) {
+-      if ( atom->check( rmesa->glCtx, atom->idx ) ) {
+-	 int size = atom->cmd_size * 4;
+-	 memcpy( dest, atom->cmd, size);
+-	 dest += size;
+-	 rmesa->backup_store.cmd_used += size;
+-	 if (R200_DEBUG & DEBUG_STATE)
+-	    print_state_atom( atom );
+-      }
+-   }
+-
+-   assert( rmesa->backup_store.cmd_used <= R200_CMD_BUF_SZ );
+-   if (R200_DEBUG & DEBUG_STATE)
+-      fprintf(stderr, "Returning to r200EmitState\n");
+-}
+-
+-void r200EmitState( r200ContextPtr rmesa )
+-{
+-   char *dest;
+-   int mtu;
+-   struct r200_state_atom *atom;
+-
+-   if (R200_DEBUG & (DEBUG_STATE|DEBUG_PRIMS))
+-      fprintf(stderr, "%s\n", __FUNCTION__);
+-
+-   if (rmesa->save_on_next_emit) {
+-      r200SaveHwState(rmesa);
+-      rmesa->save_on_next_emit = GL_FALSE;
+-   }
+-
+-   if (!rmesa->hw.is_dirty && !rmesa->hw.all_dirty)
+-      return;
+-
+-   mtu = rmesa->glCtx->Const.MaxTextureUnits;
+-
+-   /* To avoid going across the entire set of states multiple times, just check
+-    * for enough space for the case of emitting all state, and inline the
+-    * r200AllocCmdBuf code here without all the checks.
+-    */
+-   r200EnsureCmdBufSpace( rmesa, rmesa->hw.max_state_size );
+-
+-   /* we need to calculate dest after EnsureCmdBufSpace
+-      as we may flush the buffer - airlied */
+-   dest = rmesa->store.cmd_buf + rmesa->store.cmd_used;
+-   if (R200_DEBUG & DEBUG_STATE) {
+-      foreach( atom, &rmesa->hw.atomlist ) {
+-	 if ( atom->dirty || rmesa->hw.all_dirty ) {
+-	    if ( atom->check( rmesa->glCtx, atom->idx ) )
+-	       print_state_atom( atom );
+-	    else
+-	       fprintf(stderr, "skip state %s\n", atom->name);
+-	 }
+-      }
+-   }
+-
+-   foreach( atom, &rmesa->hw.atomlist ) {
+-      if ( rmesa->hw.all_dirty )
+-	 atom->dirty = GL_TRUE;
+-      if ( atom->dirty ) {
+-	 if ( atom->check( rmesa->glCtx, atom->idx ) ) {
+-	    int size = atom->cmd_size * 4;
+-	    memcpy( dest, atom->cmd, size);
+-	    dest += size;
+-	    rmesa->store.cmd_used += size;
+-	    atom->dirty = GL_FALSE;
+-	 }
+-      }
+-   }
+-
+-   assert( rmesa->store.cmd_used <= R200_CMD_BUF_SZ );
+-
+-   rmesa->hw.is_dirty = GL_FALSE;
+-   rmesa->hw.all_dirty = GL_FALSE;
++       insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.ucp[i] );
++   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.spr );
++   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.ptp );
++   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.prf );
++   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.pvs );
++   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.vpp[0] );
++   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.vpp[1] );
++   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.vpi[0] );
++   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.vpi[1] );
+ }
+ 
+ /* Fire a section of the retained (indexed_verts) buffer as a regular
+@@ -209,50 +113,81 @@ void r200EmitVbufPrim( r200ContextPtr rmesa,
+                        GLuint vertex_nr )
+ {
+    drm_radeon_cmd_header_t *cmd;
++   BATCH_LOCALS(&rmesa->radeon);
+ 
+    assert(!(primitive & R200_VF_PRIM_WALK_IND));
+    
+-   r200EmitState( rmesa );
++   radeonEmitState(&rmesa->radeon);
+    
+    if (R200_DEBUG & (DEBUG_IOCTL|DEBUG_PRIMS))
+       fprintf(stderr, "%s cmd_used/4: %d prim %x nr %d\n", __FUNCTION__,
+ 	      rmesa->store.cmd_used/4, primitive, vertex_nr);
+-   
+-   cmd = (drm_radeon_cmd_header_t *)r200AllocCmdBuf( rmesa, VBUF_BUFSZ,
+-						  __FUNCTION__ );
+-   cmd[0].i = 0;
+-   cmd[0].header.cmd_type = RADEON_CMD_PACKET3_CLIP;
+-   cmd[1].i = R200_CP_CMD_3D_DRAW_VBUF_2;
+-   cmd[2].i = (primitive | 
+-	       R200_VF_PRIM_WALK_LIST |
+-	       R200_VF_COLOR_ORDER_RGBA |
+-	       (vertex_nr << R200_VF_VERTEX_NUMBER_SHIFT));
++ 
++   BEGIN_BATCH(3);
++   OUT_BATCH_PACKET3_CLIP(R200_CP_CMD_3D_DRAW_VBUF_2, 0);
++   OUT_BATCH(primitive | R200_VF_PRIM_WALK_LIST | R200_VF_COLOR_ORDER_RGBA |
++	     (vertex_nr << R200_VF_VERTEX_NUMBER_SHIFT));
++   END_BATCH();
+ }
+ 
++static void r200FireEB(r200ContextPtr rmesa, int vertex_count, int type) /* Emit an indexed draw plus its index-buffer packet; emits nothing when vertex_count <= 0. */
++{
++	BATCH_LOCALS(&rmesa->radeon);
++
++	if (vertex_count > 0) {
++		BEGIN_BATCH(8+2); /* NOTE(review): 8+2 presumably = packet dwords + relocation dwords -- confirm against BEGIN_BATCH */
++		OUT_BATCH_PACKET3(R200_CP_CMD_3D_DRAW_INDX_2, 0);
++		OUT_BATCH(R200_VF_PRIM_WALK_IND |
++			  ((vertex_count + 0) << 16) |
++			  type);
++		
++		if (!rmesa->radeon.radeonScreen->kernel_mm) { /* non-kernel_mm path: buffer offset is emitted through OUT_BATCH_RELOC */
++			OUT_BATCH_PACKET3(R200_CP_CMD_INDX_BUFFER, 2);
++			OUT_BATCH((0x80 << 24) | (0 << 16) | 0x810);
++			OUT_BATCH_RELOC(rmesa->tcl.elt_dma_offset,
++					rmesa->tcl.elt_dma_bo,
++					rmesa->tcl.elt_dma_offset,
++					RADEON_GEM_DOMAIN_GTT, 0, 0);
++			OUT_BATCH(vertex_count/2); /* NOTE(review): presumably the dword count of 16-bit indices -- confirm */
++		} else { /* kernel_mm path: raw offset goes in the packet, relocation is written after the packet dwords */
++			OUT_BATCH_PACKET3(R200_CP_CMD_INDX_BUFFER, 2);
++			OUT_BATCH((0x80 << 24) | (0 << 16) | 0x810);
++			OUT_BATCH(rmesa->tcl.elt_dma_offset);
++			OUT_BATCH(vertex_count/2);
++			radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
++					      rmesa->tcl.elt_dma_bo,
++					      RADEON_GEM_DOMAIN_GTT, 0, 0);
++		}
++		END_BATCH();
++	}
++}
+ 
+-void r200FlushElts( r200ContextPtr rmesa )
++void r200FlushElts(GLcontext *ctx)
+ {
+-   int *cmd = (int *)(rmesa->store.cmd_buf + rmesa->store.elts_start);
++  r200ContextPtr rmesa = R200_CONTEXT(ctx);
+    int dwords;
+-   int nr = (rmesa->store.cmd_used - (rmesa->store.elts_start + 12)) / 2;
++   int nr, elt_used = rmesa->tcl.elt_used;
+ 
+    if (R200_DEBUG & (DEBUG_IOCTL|DEBUG_PRIMS))
+-      fprintf(stderr, "%s\n", __FUNCTION__);
++     fprintf(stderr, "%s %x %d\n", __FUNCTION__, rmesa->tcl.hw_primitive, elt_used);
++
++   assert( rmesa->radeon.dma.flush == r200FlushElts );
++   rmesa->radeon.dma.flush = NULL;
++
++   elt_used = (elt_used + 2) & ~2;
+ 
+-   assert( rmesa->dma.flush == r200FlushElts );
+-   rmesa->dma.flush = NULL;
++   nr = elt_used / 2;
+ 
+-   /* Cope with odd number of elts:
+-    */
+-   rmesa->store.cmd_used = (rmesa->store.cmd_used + 2) & ~2;
+-   dwords = (rmesa->store.cmd_used - rmesa->store.elts_start) / 4;
++   radeon_bo_unmap(rmesa->tcl.elt_dma_bo);
+ 
+-   cmd[1] |= (dwords - 3) << 16;
+-   cmd[2] |= nr << R200_VF_VERTEX_NUMBER_SHIFT;
++   r200FireEB(rmesa, nr, rmesa->tcl.hw_primitive);
++
++   radeon_bo_unref(rmesa->tcl.elt_dma_bo);
++   rmesa->tcl.elt_dma_bo = NULL;
+ 
+    if (R200_DEBUG & DEBUG_SYNC) {
+       fprintf(stderr, "%s: Syncing\n", __FUNCTION__);
+-      r200Finish( rmesa->glCtx );
++      radeonFinish( rmesa->radeon.glCtx );
+    }
+ }
+ 
+@@ -261,7 +196,6 @@ GLushort *r200AllocEltsOpenEnded( r200ContextPtr rmesa,
+ 				    GLuint primitive,
+ 				    GLuint min_nr )
+ {
+-   drm_radeon_cmd_header_t *cmd;
+    GLushort *retval;
+ 
+    if (R200_DEBUG & DEBUG_IOCTL)
+@@ -269,30 +203,25 @@ GLushort *r200AllocEltsOpenEnded( r200ContextPtr rmesa,
+ 
+    assert((primitive & R200_VF_PRIM_WALK_IND));
+    
+-   r200EmitState( rmesa );
+-   
+-   cmd = (drm_radeon_cmd_header_t *)r200AllocCmdBuf( rmesa, ELTS_BUFSZ(min_nr),
+-						__FUNCTION__ );
+-   cmd[0].i = 0;
+-   cmd[0].header.cmd_type = RADEON_CMD_PACKET3_CLIP;
+-   cmd[1].i = R200_CP_CMD_3D_DRAW_INDX_2;
+-   cmd[2].i = (primitive | 
+-	       R200_VF_PRIM_WALK_IND |
+-	       R200_VF_COLOR_ORDER_RGBA);
++   radeonEmitState(&rmesa->radeon);
+ 
++   rmesa->tcl.elt_dma_bo = radeon_bo_open(rmesa->radeon.radeonScreen->bom,
++					  0, R200_ELT_BUF_SZ, 4,
++					  RADEON_GEM_DOMAIN_GTT, 0);
++   rmesa->tcl.elt_dma_offset = 0;
++   rmesa->tcl.elt_used = min_nr * 2;
++
++   radeon_bo_map(rmesa->tcl.elt_dma_bo, 1);
++   retval = rmesa->tcl.elt_dma_bo->ptr + rmesa->tcl.elt_dma_offset;
+    
+-   retval = (GLushort *)(cmd+3);
+ 
+    if (R200_DEBUG & DEBUG_PRIMS)
+-      fprintf(stderr, "%s: header 0x%x prim %x \n",
+-	      __FUNCTION__,
+-	      cmd[1].i, primitive);
+-
+-   assert(!rmesa->dma.flush);
+-   rmesa->glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
+-   rmesa->dma.flush = r200FlushElts;
++      fprintf(stderr, "%s: header prim %x \n",
++	      __FUNCTION__, primitive);
+ 
+-   rmesa->store.elts_start = ((char *)cmd) - rmesa->store.cmd_buf;
++   assert(!rmesa->radeon.dma.flush);
++   rmesa->radeon.glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
++   rmesa->radeon.dma.flush = r200FlushElts;
+ 
+    return retval;
+ }
+@@ -300,129 +229,130 @@ GLushort *r200AllocEltsOpenEnded( r200ContextPtr rmesa,
+ 
+ 
+ void r200EmitVertexAOS( r200ContextPtr rmesa,
+-			  GLuint vertex_size,
+-			  GLuint offset )
++			GLuint vertex_size,
++ 			struct radeon_bo *bo,
++			GLuint offset )
+ {
+-   drm_radeon_cmd_header_t *cmd;
++   BATCH_LOCALS(&rmesa->radeon);
+ 
+    if (R200_DEBUG & (DEBUG_PRIMS|DEBUG_IOCTL))
+       fprintf(stderr, "%s:  vertex_size 0x%x offset 0x%x \n",
+ 	      __FUNCTION__, vertex_size, offset);
+ 
+-   cmd = (drm_radeon_cmd_header_t *)r200AllocCmdBuf( rmesa, VERT_AOS_BUFSZ,
+-						  __FUNCTION__ );
+ 
+-   cmd[0].header.cmd_type = RADEON_CMD_PACKET3;
+-   cmd[1].i = R200_CP_CMD_3D_LOAD_VBPNTR | (2 << 16);
+-   cmd[2].i = 1;
+-   cmd[3].i = vertex_size | (vertex_size << 8);
+-   cmd[4].i = offset;
++   BEGIN_BATCH(5);
++   OUT_BATCH_PACKET3(R200_CP_CMD_3D_LOAD_VBPNTR, 2);
++   OUT_BATCH(1);
++   OUT_BATCH(vertex_size | (vertex_size << 8));
++   OUT_BATCH_RELOC(offset, bo, offset, RADEON_GEM_DOMAIN_GTT, 0, 0);
++   END_BATCH();
+ }
+-		       
+ 
+-void r200EmitAOS( r200ContextPtr rmesa,
+-		    struct r200_dma_region **component,
+-		    GLuint nr,
+-		    GLuint offset )
++void r200EmitAOS(r200ContextPtr rmesa, GLuint nr, GLuint offset)
+ {
+-   drm_radeon_cmd_header_t *cmd;
+-   int sz = AOS_BUFSZ(nr);
++   BATCH_LOCALS(&rmesa->radeon);
++   uint32_t voffset;
++   int sz = 1 + (nr >> 1) * 3 + (nr & 1) * 2;
+    int i;
+-   int *tmp;
+-
+-   if (R200_DEBUG & DEBUG_IOCTL)
+-      fprintf(stderr, "%s nr arrays: %d\n", __FUNCTION__, nr);
+-
+-   cmd = (drm_radeon_cmd_header_t *)r200AllocCmdBuf( rmesa, sz, __FUNCTION__ );
+-   cmd[0].i = 0;
+-   cmd[0].header.cmd_type = RADEON_CMD_PACKET3;
+-   cmd[1].i = R200_CP_CMD_3D_LOAD_VBPNTR | (((sz / sizeof(int)) - 3) << 16);
+-   cmd[2].i = nr;
+-   tmp = &cmd[0].i;
+-   cmd += 3;
+-
+-   for (i = 0 ; i < nr ; i++) {
+-      if (i & 1) {
+-	 cmd[0].i |= ((component[i]->aos_stride << 24) | 
+-		      (component[i]->aos_size << 16));
+-	 cmd[2].i = (component[i]->aos_start + 
+-		     offset * component[i]->aos_stride * 4);
+-	 cmd += 3;
++   
++   if (RADEON_DEBUG & DEBUG_VERTS)
++      fprintf(stderr, "%s: nr=%d, ofs=0x%08x\n", __FUNCTION__, nr,
++	      offset);
 +
-+/* bo object */
-+#define RADEON_BO_FLAGS_MACRO_TILE  1
-+#define RADEON_BO_FLAGS_MICRO_TILE  2
++   BEGIN_BATCH(sz+2+ (nr*2));
++   OUT_BATCH_PACKET3(R200_CP_CMD_3D_LOAD_VBPNTR, sz - 1);
++   OUT_BATCH(nr);
 +
-+struct radeon_bo_manager;
++    
++   if (!rmesa->radeon.radeonScreen->kernel_mm) {
++      for (i = 0; i + 1 < nr; i += 2) {
++	 OUT_BATCH((rmesa->tcl.aos[i].components << 0) |
++		   (rmesa->tcl.aos[i].stride << 8) |
++		   (rmesa->tcl.aos[i + 1].components << 16) |
++		   (rmesa->tcl.aos[i + 1].stride << 24));
++			
++	 voffset =  rmesa->tcl.aos[i + 0].offset +
++	    offset * 4 * rmesa->tcl.aos[i + 0].stride;
++	 OUT_BATCH_RELOC(voffset,
++			 rmesa->tcl.aos[i].bo,
++			 voffset,
++			 RADEON_GEM_DOMAIN_GTT,
++			 0, 0);
++	 voffset =  rmesa->tcl.aos[i + 1].offset +
++	    offset * 4 * rmesa->tcl.aos[i + 1].stride;
++	 OUT_BATCH_RELOC(voffset,
++			 rmesa->tcl.aos[i+1].bo,
++			 voffset,
++			 RADEON_GEM_DOMAIN_GTT,
++			 0, 0);
+       }
+-      else {
+-	 cmd[0].i = ((component[i]->aos_stride << 8) | 
+-		     (component[i]->aos_size << 0));
+-	 cmd[1].i = (component[i]->aos_start + 
+-		     offset * component[i]->aos_stride * 4);
++      
++      if (nr & 1) {
++	 OUT_BATCH((rmesa->tcl.aos[nr - 1].components << 0) |
++		   (rmesa->tcl.aos[nr - 1].stride << 8));
++	 voffset =  rmesa->tcl.aos[nr - 1].offset +
++	    offset * 4 * rmesa->tcl.aos[nr - 1].stride;
++	 OUT_BATCH_RELOC(voffset,
++			 rmesa->tcl.aos[nr - 1].bo,
++			 voffset,
++			 RADEON_GEM_DOMAIN_GTT,
++			 0, 0);
++      }
++   } else {
++      for (i = 0; i + 1 < nr; i += 2) {
++	 OUT_BATCH((rmesa->tcl.aos[i].components << 0) |
++		   (rmesa->tcl.aos[i].stride << 8) |
++		   (rmesa->tcl.aos[i + 1].components << 16) |
++		   (rmesa->tcl.aos[i + 1].stride << 24));
++	 
++	 voffset =  rmesa->tcl.aos[i + 0].offset +
++	    offset * 4 * rmesa->tcl.aos[i + 0].stride;
++	 OUT_BATCH(voffset);
++	 voffset =  rmesa->tcl.aos[i + 1].offset +
++	    offset * 4 * rmesa->tcl.aos[i + 1].stride;
++	 OUT_BATCH(voffset);
++      }
++      
++      if (nr & 1) {
++	 OUT_BATCH((rmesa->tcl.aos[nr - 1].components << 0) |
++		   (rmesa->tcl.aos[nr - 1].stride << 8));
++	 voffset =  rmesa->tcl.aos[nr - 1].offset +
++	    offset * 4 * rmesa->tcl.aos[nr - 1].stride;
++	 OUT_BATCH(voffset);
++      }
++      for (i = 0; i + 1 < nr; i += 2) {
++	 voffset =  rmesa->tcl.aos[i + 0].offset +
++	    offset * 4 * rmesa->tcl.aos[i + 0].stride;
++	 radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
++			       rmesa->tcl.aos[i+0].bo,
++			       RADEON_GEM_DOMAIN_GTT,
++			       0, 0);
++	 voffset =  rmesa->tcl.aos[i + 1].offset +
++	    offset * 4 * rmesa->tcl.aos[i + 1].stride;
++	 radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
++			       rmesa->tcl.aos[i+1].bo,
++			       RADEON_GEM_DOMAIN_GTT,
++			       0, 0);
++      }
++      if (nr & 1) {
++	 voffset =  rmesa->tcl.aos[nr - 1].offset +
++	    offset * 4 * rmesa->tcl.aos[nr - 1].stride;
++	 radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
++			       rmesa->tcl.aos[nr-1].bo,
++			       RADEON_GEM_DOMAIN_GTT,
++			       0, 0);
+       }
+    }
+-
+-   if (R200_DEBUG & DEBUG_VERTS) {
+-      fprintf(stderr, "%s:\n", __FUNCTION__);
+-      for (i = 0 ; i < sz ; i++)
+-	 fprintf(stderr, "   %d: %x\n", i, tmp[i]);
+-   }
++   END_BATCH();
+ }
+ 
+-void r200EmitBlit( r200ContextPtr rmesa,
+-		   GLuint color_fmt,
+-		   GLuint src_pitch,
+-		   GLuint src_offset,
+-		   GLuint dst_pitch,
+-		   GLuint dst_offset,
+-		   GLint srcx, GLint srcy,
+-		   GLint dstx, GLint dsty,
+-		   GLuint w, GLuint h )
++void r200FireAOS(r200ContextPtr rmesa, int vertex_count, int type)
+ {
+-   drm_radeon_cmd_header_t *cmd;
++	BATCH_LOCALS(&rmesa->radeon);
+ 
+-   if (R200_DEBUG & DEBUG_IOCTL)
+-      fprintf(stderr, "%s src %x/%x %d,%d dst: %x/%x %d,%d sz: %dx%d\n",
+-	      __FUNCTION__, 
+-	      src_pitch, src_offset, srcx, srcy,
+-	      dst_pitch, dst_offset, dstx, dsty,
+-	      w, h);
+-
+-   assert( (src_pitch & 63) == 0 );
+-   assert( (dst_pitch & 63) == 0 );
+-   assert( (src_offset & 1023) == 0 );
+-   assert( (dst_offset & 1023) == 0 );
+-   assert( w < (1<<16) );
+-   assert( h < (1<<16) );
+-
+-   cmd = (drm_radeon_cmd_header_t *)r200AllocCmdBuf( rmesa, 8 * sizeof(int),
+-						  __FUNCTION__ );
+-
+-
+-   cmd[0].header.cmd_type = RADEON_CMD_PACKET3;
+-   cmd[1].i = R200_CP_CMD_BITBLT_MULTI | (5 << 16);
+-   cmd[2].i = (RADEON_GMC_SRC_PITCH_OFFSET_CNTL |
+-	       RADEON_GMC_DST_PITCH_OFFSET_CNTL |
+-	       RADEON_GMC_BRUSH_NONE |
+-	       (color_fmt << 8) |
+-	       RADEON_GMC_SRC_DATATYPE_COLOR |
+-	       RADEON_ROP3_S |
+-	       RADEON_DP_SRC_SOURCE_MEMORY |
+-	       RADEON_GMC_CLR_CMP_CNTL_DIS |
+-	       RADEON_GMC_WR_MSK_DIS );
+-
+-   cmd[3].i = ((src_pitch/64)<<22) | (src_offset >> 10);
+-   cmd[4].i = ((dst_pitch/64)<<22) | (dst_offset >> 10);
+-   cmd[5].i = (srcx << 16) | srcy;
+-   cmd[6].i = (dstx << 16) | dsty; /* dst */
+-   cmd[7].i = (w << 16) | h;
++	BEGIN_BATCH(3);
++	OUT_BATCH_PACKET3(R200_CP_CMD_3D_DRAW_VBUF_2, 0);
++	OUT_BATCH(R200_VF_PRIM_WALK_LIST | (vertex_count << 16) | type);
++	END_BATCH();
+ }
+ 
+-
+-void r200EmitWait( r200ContextPtr rmesa, GLuint flags )
+-{
+-   drm_radeon_cmd_header_t *cmd;
+-
+-   assert( !(flags & ~(RADEON_WAIT_2D|RADEON_WAIT_3D)) );
+-
+-   cmd = (drm_radeon_cmd_header_t *)r200AllocCmdBuf( rmesa, 1 * sizeof(int),
+-					   __FUNCTION__ );
+-   cmd[0].i = 0;
+-   cmd[0].wait.cmd_type = RADEON_CMD_WAIT;
+-   cmd[0].wait.flags = flags;
+-}
+diff --git a/src/mesa/drivers/dri/r200/r200_context.c b/src/mesa/drivers/dri/r200/r200_context.c
+index 5531e0a..a744469 100644
+--- a/src/mesa/drivers/dri/r200/r200_context.c
++++ b/src/mesa/drivers/dri/r200/r200_context.c
+@@ -54,7 +54,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "r200_context.h"
+ #include "r200_ioctl.h"
+ #include "r200_state.h"
+-#include "r200_span.h"
+ #include "r200_pixel.h"
+ #include "r200_tex.h"
+ #include "r200_swtcl.h"
+@@ -62,14 +61,12 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "r200_maos.h"
+ #include "r200_vertprog.h"
+ 
+-#define need_GL_ARB_multisample
+-#define need_GL_ARB_texture_compression
+-#define need_GL_ARB_vertex_buffer_object
++#include "radeon_span.h"
++
+ #define need_GL_ARB_vertex_program
+ #define need_GL_ATI_fragment_shader
+ #define need_GL_EXT_blend_minmax
+ #define need_GL_EXT_fog_coord
+-#define need_GL_EXT_multi_draw_arrays
+ #define need_GL_EXT_secondary_color
+ #define need_GL_EXT_blend_equation_separate
+ #define need_GL_EXT_blend_func_separate
+@@ -82,9 +79,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "vblank.h"
+ #include "utils.h"
+ #include "xmlpool.h" /* for symbolic values of enum-type options */
+-#ifndef R200_DEBUG
+-int R200_DEBUG = (0);
+-#endif
+ 
+ /* Return various strings for glGetString().
+  */
+@@ -93,8 +87,8 @@ static const GLubyte *r200GetString( GLcontext *ctx, GLenum name )
+    r200ContextPtr rmesa = R200_CONTEXT(ctx);
+    static char buffer[128];
+    unsigned   offset;
+-   GLuint agp_mode = (rmesa->r200Screen->card_type == RADEON_CARD_PCI)? 0 :
+-      rmesa->r200Screen->AGPMode;
++   GLuint agp_mode = (rmesa->radeon.radeonScreen->card_type == RADEON_CARD_PCI)? 0 :
++      rmesa->radeon.radeonScreen->AGPMode;
+ 
+    switch ( name ) {
+    case GL_VENDOR:
+@@ -105,7 +99,7 @@ static const GLubyte *r200GetString( GLcontext *ctx, GLenum name )
+ 				     agp_mode );
+ 
+       sprintf( & buffer[ offset ], " %sTCL",
+-	       !(rmesa->TclFallback & R200_TCL_FALLBACK_TCL_DISABLE)
++	       !(rmesa->radeon.TclFallback & R200_TCL_FALLBACK_TCL_DISABLE)
+ 	       ? "" : "NO-" );
+ 
+       return (GLubyte *)buffer;
+@@ -120,20 +114,16 @@ static const GLubyte *r200GetString( GLcontext *ctx, GLenum name )
+  */
+ const struct dri_extension card_extensions[] =
+ {
+-    { "GL_ARB_multisample",                GL_ARB_multisample_functions },
+     { "GL_ARB_multitexture",               NULL },
+     { "GL_ARB_texture_border_clamp",       NULL },
+-    { "GL_ARB_texture_compression",        GL_ARB_texture_compression_functions },
+     { "GL_ARB_texture_env_add",            NULL },
+     { "GL_ARB_texture_env_combine",        NULL },
+     { "GL_ARB_texture_env_dot3",           NULL },
+     { "GL_ARB_texture_env_crossbar",       NULL },
+     { "GL_ARB_texture_mirrored_repeat",    NULL },
+-    { "GL_ARB_vertex_buffer_object",       GL_ARB_vertex_buffer_object_functions },
+     { "GL_EXT_blend_minmax",               GL_EXT_blend_minmax_functions },
+     { "GL_EXT_blend_subtract",             NULL },
+     { "GL_EXT_fog_coord",                  GL_EXT_fog_coord_functions },
+-    { "GL_EXT_multi_draw_arrays",          GL_EXT_multi_draw_arrays_functions },
+     { "GL_EXT_secondary_color",            GL_EXT_secondary_color_functions },
+     { "GL_EXT_stencil_wrap",               NULL },
+     { "GL_EXT_texture_edge_clamp",         NULL },
+@@ -242,6 +232,40 @@ static const struct dri_debug_control debug_control[] =
+     { NULL,    0 }
+ };
+ 
++static void r200_get_lock(radeonContextPtr radeon)
++{
++   r200ContextPtr rmesa = (r200ContextPtr)radeon;
++   drm_radeon_sarea_t *sarea = radeon->sarea;
++   int i;
 +
-+struct radeon_bo {
-+    uint32_t                    alignment;
-+    uint32_t                    handle;
-+    uint32_t                    size;
-+    uint32_t                    domains;
-+    uint32_t                    flags;
-+    unsigned                    cref;
-+#ifdef RADEON_BO_TRACK
-+    struct radeon_track         *track;
-+#endif
-+    void                        *ptr;
-+    struct radeon_bo_manager    *bom;
-+    uint32_t                    space_accounted;
-+};
++   R200_STATECHANGE( rmesa, ctx );
++   if (rmesa->radeon.sarea->tiling_enabled) {
++      rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |= R200_COLOR_TILE_ENABLE;
++   }
++   else rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] &= ~R200_COLOR_TILE_ENABLE;
 +
-+/* bo functions */
-+struct radeon_bo_funcs {
-+    struct radeon_bo *(*bo_open)(struct radeon_bo_manager *bom,
-+                                 uint32_t handle,
-+                                 uint32_t size,
-+                                 uint32_t alignment,
-+                                 uint32_t domains,
-+                                 uint32_t flags);
-+    void (*bo_ref)(struct radeon_bo *bo);
-+    struct radeon_bo *(*bo_unref)(struct radeon_bo *bo);
-+    int (*bo_map)(struct radeon_bo *bo, int write);
-+    int (*bo_unmap)(struct radeon_bo *bo);
-+    int (*bo_wait)(struct radeon_bo *bo);
-+};
++   if ( sarea->ctx_owner != rmesa->radeon.dri.hwContext ) {
++      sarea->ctx_owner = rmesa->radeon.dri.hwContext;
++      if (!radeon->radeonScreen->kernel_mm)
++         radeon_bo_legacy_texture_age(radeon->radeonScreen->bom);
++   }
 +
-+struct radeon_bo_manager {
-+    struct radeon_bo_funcs  *funcs;
-+    int                     fd;
++}
 +
-+#ifdef RADEON_BO_TRACK
-+    struct radeon_tracker   tracker;
-+#endif
-+};
-+    
-+static inline void _radeon_bo_debug(struct radeon_bo *bo,
-+                                    const char *op,
-+                                    const char *file,
-+                                    const char *func,
-+                                    int line)
++static void r200_vtbl_emit_cs_header(struct radeon_cs *cs, radeonContextPtr rmesa)
 +{
-+    fprintf(stderr, "%s %p 0x%08X 0x%08X 0x%08X [%s %s %d]\n",
-+            op, bo, bo->handle, bo->size, bo->cref, file, func, line);
 +}
 +
-+static inline struct radeon_bo *_radeon_bo_open(struct radeon_bo_manager *bom,
-+                                                uint32_t handle,
-+                                                uint32_t size,
-+                                                uint32_t alignment,
-+                                                uint32_t domains,
-+                                                uint32_t flags,
-+                                                const char *file,
-+                                                const char *func,
-+                                                int line)
++
++static void r200_init_vtbl(radeonContextPtr radeon)
 +{
-+    struct radeon_bo *bo;
++   radeon->vtbl.get_lock = r200_get_lock;
++   radeon->vtbl.update_viewport_offset = r200UpdateViewportOffset;
++   radeon->vtbl.update_draw_buffer = r200UpdateDrawBuffer;
++   radeon->vtbl.emit_cs_header = r200_vtbl_emit_cs_header;
++   radeon->vtbl.swtcl_flush = r200_swtcl_flush;
++}
 +
-+    bo = bom->funcs->bo_open(bom, handle, size, alignment, domains, flags);
-+#ifdef RADEON_BO_TRACK
-+    if (bo) {
-+        bo->track = radeon_tracker_add_track(&bom->tracker, bo->handle);
-+        radeon_track_add_event(bo->track, file, func, "open", line);
-+    }
+ 
+ /* Create the device specific rendering context.
+  */
+@@ -253,9 +277,9 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
+    radeonScreenPtr screen = (radeonScreenPtr)(sPriv->private);
+    struct dd_function_table functions;
+    r200ContextPtr rmesa;
+-   GLcontext *ctx, *shareCtx;
++   GLcontext *ctx;
+    int i;
+-   int tcl_mode, fthrottle_mode;
++   int tcl_mode;
+ 
+    assert(glVisual);
+    assert(driContextPriv);
+@@ -265,7 +289,8 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
+    rmesa = (r200ContextPtr) CALLOC( sizeof(*rmesa) );
+    if ( !rmesa )
+       return GL_FALSE;
+-      
++
++   r200_init_vtbl(&rmesa->radeon);
+    /* init exp fog table data */
+    r200InitStaticFogData();
+ 
+@@ -273,12 +298,12 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
+     * Do this here so that initialMaxAnisotropy is set before we create
+     * the default textures.
+     */
+-   driParseConfigFiles (&rmesa->optionCache, &screen->optionCache,
++   driParseConfigFiles (&rmesa->radeon.optionCache, &screen->optionCache,
+ 			screen->driScreen->myNum, "r200");
+-   rmesa->initialMaxAnisotropy = driQueryOptionf(&rmesa->optionCache,
+-                                                 "def_max_anisotropy");
++   rmesa->radeon.initialMaxAnisotropy = driQueryOptionf(&rmesa->radeon.optionCache,
++							"def_max_anisotropy");
+ 
+-   if ( driQueryOptionb( &rmesa->optionCache, "hyperz" ) ) {
++   if ( driQueryOptionb( &rmesa->radeon.optionCache, "hyperz" ) ) {
+       if ( sPriv->drm_version.minor < 13 )
+ 	 fprintf( stderr, "DRM version 1.%d too old to support HyperZ, "
+ 			  "disabling.\n", sPriv->drm_version.minor );
+@@ -299,59 +324,21 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
+    r200InitTextureFuncs(&functions);
+    r200InitShaderFuncs(&functions); 
+ 
+-   /* Allocate and initialize the Mesa context */
+-   if (sharedContextPrivate)
+-      shareCtx = ((r200ContextPtr) sharedContextPrivate)->glCtx;
+-   else
+-      shareCtx = NULL;
+-   rmesa->glCtx = _mesa_create_context(glVisual, shareCtx,
+-                                       &functions, (void *) rmesa);
+-   if (!rmesa->glCtx) {
+-      FREE(rmesa);
+-      return GL_FALSE;
+-   }
+-   driContextPriv->driverPrivate = rmesa;
+-
+-   /* Init r200 context data */
+-   rmesa->dri.context = driContextPriv;
+-   rmesa->dri.screen = sPriv;
+-   rmesa->dri.drawable = NULL; /* Set by XMesaMakeCurrent */
+-   rmesa->dri.hwContext = driContextPriv->hHWContext;
+-   rmesa->dri.hwLock = &sPriv->pSAREA->lock;
+-   rmesa->dri.fd = sPriv->fd;
+-   rmesa->dri.drmMinor = sPriv->drm_version.minor;
+-
+-   rmesa->r200Screen = screen;
+-   rmesa->sarea = (drm_radeon_sarea_t *)((GLubyte *)sPriv->pSAREA +
+-				       screen->sarea_priv_offset);
+-
+-
+-   rmesa->dma.buf0_address = rmesa->r200Screen->buffers->list[0].address;
+-
+-   (void) memset( rmesa->texture_heaps, 0, sizeof( rmesa->texture_heaps ) );
+-   make_empty_list( & rmesa->swapped );
+-
+-   rmesa->nr_heaps = 1 /* screen->numTexHeaps */ ;
+-   assert(rmesa->nr_heaps < RADEON_NR_TEX_HEAPS);
+-   for ( i = 0 ; i < rmesa->nr_heaps ; i++ ) {
+-      rmesa->texture_heaps[i] = driCreateTextureHeap( i, rmesa,
+-	    screen->texSize[i],
+-	    12,
+-	    RADEON_NR_TEX_REGIONS,
+-	    (drmTextureRegionPtr)rmesa->sarea->tex_list[i],
+-	    & rmesa->sarea->tex_age[i],
+-	    & rmesa->swapped,
+-	    sizeof( r200TexObj ),
+-	    (destroy_texture_object_t *) r200DestroyTexObj );
++   if (!radeonInitContext(&rmesa->radeon, &functions,
++			  glVisual, driContextPriv,
++			  sharedContextPrivate)) {
++     FREE(rmesa);
++     return GL_FALSE;
+    }
+-   rmesa->texture_depth = driQueryOptioni (&rmesa->optionCache,
++
++   rmesa->radeon.texture_depth = driQueryOptioni (&rmesa->radeon.optionCache,
+ 					   "texture_depth");
+-   if (rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_FB)
+-      rmesa->texture_depth = ( screen->cpp == 4 ) ?
++   if (rmesa->radeon.texture_depth == DRI_CONF_TEXTURE_DEPTH_FB)
++      rmesa->radeon.texture_depth = ( screen->cpp == 4 ) ?
+ 	 DRI_CONF_TEXTURE_DEPTH_32 : DRI_CONF_TEXTURE_DEPTH_16;
+ 
+-   rmesa->swtcl.RenderIndex = ~0;
+-   rmesa->hw.all_dirty = 1;
++   rmesa->radeon.swtcl.RenderIndex = ~0;
++   rmesa->radeon.hw.all_dirty = 1;
+ 
+    /* Set the maximum texture size small enough that we can guarentee that
+     * all texture units can bind a maximal texture and have all of them in
+@@ -359,29 +346,13 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
+     * setting allow larger textures.
+     */
+ 
+-   ctx = rmesa->glCtx;
+-   ctx->Const.MaxTextureUnits = driQueryOptioni (&rmesa->optionCache,
++   ctx = rmesa->radeon.glCtx;
++   ctx->Const.MaxTextureUnits = driQueryOptioni (&rmesa->radeon.optionCache,
+ 						 "texture_units");
+    ctx->Const.MaxTextureImageUnits = ctx->Const.MaxTextureUnits;
+    ctx->Const.MaxTextureCoordUnits = ctx->Const.MaxTextureUnits;
+ 
+-   i = driQueryOptioni( &rmesa->optionCache, "allow_large_textures");
+-
+-   driCalculateMaxTextureLevels( rmesa->texture_heaps,
+-				 rmesa->nr_heaps,
+-				 & ctx->Const,
+-				 4,
+-				 11, /* max 2D texture size is 2048x2048 */
+-#if ENABLE_HW_3D_TEXTURE
+-				 8,  /* max 3D texture size is 256^3 */
+-#else
+-				 0,  /* 3D textures unsupported */
+-#endif
+-				 11, /* max cube texture size is 2048x2048 */
+-				 11, /* max texture rectangle size is 2048x2048 */
+-				 12,
+-				 GL_FALSE,
+-				 i );
++   i = driQueryOptioni( &rmesa->radeon.optionCache, "allow_large_textures");
+ 
+    ctx->Const.MaxTextureMaxAnisotropy = 16.0;
+ 
+@@ -391,7 +362,7 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
+    ctx->Const.MinPointSizeAA = 1.0;
+    ctx->Const.MaxPointSizeAA = 1.0;
+    ctx->Const.PointSizeGranularity = 0.0625;
+-   if (rmesa->r200Screen->drmSupportsPointSprites)
++   if (rmesa->radeon.radeonScreen->drmSupportsPointSprites)
+       ctx->Const.MaxPointSize = 2047.0;
+    else
+       ctx->Const.MaxPointSize = 1.0;
+@@ -411,6 +382,8 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
+    ctx->Const.VertexProgram.MaxNativeParameters = R200_VSF_MAX_PARAM;
+    ctx->Const.VertexProgram.MaxNativeAddressRegs = 1;
+ 
++   ctx->Const.MaxDrawBuffers = 1;
++
+    /* Initialize the software rasterizer and helper modules.
+     */
+    _swrast_CreateContext( ctx );
+@@ -445,32 +418,32 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
+    _math_matrix_set_identity( &rmesa->tmpmat );
+ 
+    driInitExtensions( ctx, card_extensions, GL_TRUE );
+-   if (!(rmesa->r200Screen->chip_flags & R200_CHIPSET_YCBCR_BROKEN)) {
++   if (!(rmesa->radeon.radeonScreen->chip_flags & R200_CHIPSET_YCBCR_BROKEN)) {
+      /* yuv textures don't work with some chips - R200 / rv280 okay so far
+ 	others get the bit ordering right but don't actually do YUV-RGB conversion */
+       _mesa_enable_extension( ctx, "GL_MESA_ycbcr_texture" );
+    }
+-   if (rmesa->glCtx->Mesa_DXTn) {
++   if (rmesa->radeon.glCtx->Mesa_DXTn) {
+       _mesa_enable_extension( ctx, "GL_EXT_texture_compression_s3tc" );
+       _mesa_enable_extension( ctx, "GL_S3_s3tc" );
+    }
+-   else if (driQueryOptionb (&rmesa->optionCache, "force_s3tc_enable")) {
++   else if (driQueryOptionb (&rmesa->radeon.optionCache, "force_s3tc_enable")) {
+       _mesa_enable_extension( ctx, "GL_EXT_texture_compression_s3tc" );
+    }
+ 
+-   if (rmesa->r200Screen->drmSupportsCubeMapsR200)
++   if (rmesa->radeon.radeonScreen->drmSupportsCubeMapsR200)
+       _mesa_enable_extension( ctx, "GL_ARB_texture_cube_map" );
+-   if (rmesa->r200Screen->drmSupportsBlendColor) {
++   if (rmesa->radeon.radeonScreen->drmSupportsBlendColor) {
+        driInitExtensions( ctx, blend_extensions, GL_FALSE );
+    }
+-   if(rmesa->r200Screen->drmSupportsVertexProgram)
++   if(rmesa->radeon.radeonScreen->drmSupportsVertexProgram)
+       driInitSingleExtension( ctx, ARB_vp_extension );
+-   if(driQueryOptionb(&rmesa->optionCache, "nv_vertex_program"))
++   if(driQueryOptionb(&rmesa->radeon.optionCache, "nv_vertex_program"))
+       driInitSingleExtension( ctx, NV_vp_extension );
+ 
+-   if ((ctx->Const.MaxTextureUnits == 6) && rmesa->r200Screen->drmSupportsFragShader)
++   if ((ctx->Const.MaxTextureUnits == 6) && rmesa->radeon.radeonScreen->drmSupportsFragShader)
+       driInitSingleExtension( ctx, ATI_fs_extension );
+-   if (rmesa->r200Screen->drmSupportsPointSprites)
++   if (rmesa->radeon.radeonScreen->drmSupportsPointSprites)
+       driInitExtensions( ctx, point_extensions, GL_FALSE );
+ #if 0
+    r200InitDriverFuncs( ctx );
+@@ -480,33 +453,15 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
+ #endif
+    /* plug in a few more device driver functions */
+    /* XXX these should really go right after _mesa_init_driver_functions() */
++   radeonInitSpanFuncs( ctx );
+    r200InitPixelFuncs( ctx );
+-   r200InitSpanFuncs( ctx );
+    r200InitTnlFuncs( ctx );
+    r200InitState( rmesa );
+    r200InitSwtcl( ctx );
+ 
+-   fthrottle_mode = driQueryOptioni(&rmesa->optionCache, "fthrottle_mode");
+-   rmesa->iw.irq_seq = -1;
+-   rmesa->irqsEmitted = 0;
+-   rmesa->do_irqs = (fthrottle_mode == DRI_CONF_FTHROTTLE_IRQS &&
+-		     rmesa->r200Screen->irq);
+-
+-   rmesa->do_usleeps = (fthrottle_mode == DRI_CONF_FTHROTTLE_USLEEPS);
+-
+-   if (!rmesa->do_irqs)
+-      fprintf(stderr,
+-	      "IRQ's not enabled, falling back to %s: %d %d\n",
+-	      rmesa->do_usleeps ? "usleeps" : "busy waits",
+-	      fthrottle_mode,
+-	      rmesa->r200Screen->irq);
+-
+    rmesa->prefer_gart_client_texturing = 
+       (getenv("R200_GART_CLIENT_TEXTURES") != 0);
+ 
+-   (*sPriv->systemTime->getUST)( & rmesa->swap_ust );
+-
+-
+ #if DO_DEBUG
+    R200_DEBUG  = driParseDebugString( getenv( "R200_DEBUG" ),
+ 				      debug_control );
+@@ -514,18 +469,18 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
+ 				      debug_control );
+ #endif
+ 
+-   tcl_mode = driQueryOptioni(&rmesa->optionCache, "tcl_mode");
+-   if (driQueryOptionb(&rmesa->optionCache, "no_rast")) {
++   tcl_mode = driQueryOptioni(&rmesa->radeon.optionCache, "tcl_mode");
++   if (driQueryOptionb(&rmesa->radeon.optionCache, "no_rast")) {
+       fprintf(stderr, "disabling 3D acceleration\n");
+       FALLBACK(rmesa, R200_FALLBACK_DISABLE, 1);
+    }
+    else if (tcl_mode == DRI_CONF_TCL_SW || getenv("R200_NO_TCL") ||
+-	    !(rmesa->r200Screen->chip_flags & RADEON_CHIPSET_TCL)) {
+-      if (rmesa->r200Screen->chip_flags & RADEON_CHIPSET_TCL) {
+-	 rmesa->r200Screen->chip_flags &= ~RADEON_CHIPSET_TCL;
++	    !(rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL)) {
++      if (rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
++	 rmesa->radeon.radeonScreen->chip_flags &= ~RADEON_CHIPSET_TCL;
+ 	 fprintf(stderr, "Disabling HW TCL support\n");
+       }
+-      TCL_FALLBACK(rmesa->glCtx, R200_TCL_FALLBACK_TCL_DISABLE, 1);
++      TCL_FALLBACK(rmesa->radeon.glCtx, R200_TCL_FALLBACK_TCL_DISABLE, 1);
+    }
+ 
+    return GL_TRUE;
+@@ -544,55 +499,33 @@ void r200DestroyContext( __DRIcontextPrivate *driContextPriv )
+ 
+    /* check if we're deleting the currently bound context */
+    if (rmesa == current) {
+-      R200_FIREVERTICES( rmesa );
++      radeon_firevertices(&rmesa->radeon);
+       _mesa_make_current(NULL, NULL, NULL);
+    }
+ 
+    /* Free r200 context resources */
+    assert(rmesa); /* should never be null */
+    if ( rmesa ) {
+-      GLboolean   release_texture_heaps;
+ 
++      _swsetup_DestroyContext( rmesa->radeon.glCtx );
++      _tnl_DestroyContext( rmesa->radeon.glCtx );
++      _vbo_DestroyContext( rmesa->radeon.glCtx );
++      _swrast_DestroyContext( rmesa->radeon.glCtx );
+ 
+-      release_texture_heaps = (rmesa->glCtx->Shared->RefCount == 1);
+-      _swsetup_DestroyContext( rmesa->glCtx );
+-      _tnl_DestroyContext( rmesa->glCtx );
+-      _vbo_DestroyContext( rmesa->glCtx );
+-      _swrast_DestroyContext( rmesa->glCtx );
++      r200DestroySwtcl( rmesa->radeon.glCtx );
++      r200ReleaseArrays( rmesa->radeon.glCtx, ~0 );
+ 
+-      r200DestroySwtcl( rmesa->glCtx );
+-      r200ReleaseArrays( rmesa->glCtx, ~0 );
+-
+-      if (rmesa->dma.current.buf) {
+-	 r200ReleaseDmaRegion( rmesa, &rmesa->dma.current, __FUNCTION__ );
+-	 r200FlushCmdBuf( rmesa, __FUNCTION__ );
+-      }
+-
+-      if (rmesa->state.scissor.pClipRects) {
+-	 FREE(rmesa->state.scissor.pClipRects);
+-	 rmesa->state.scissor.pClipRects = NULL;
++      if (rmesa->radeon.dma.current) {
++	 radeonReleaseDmaRegion( &rmesa->radeon );
++	 rcommonFlushCmdBuf( &rmesa->radeon, __FUNCTION__ );
+       }
+ 
+-      if ( release_texture_heaps ) {
+-         /* This share group is about to go away, free our private
+-          * texture object data.
+-          */
+-         int i;
+-
+-         for ( i = 0 ; i < rmesa->nr_heaps ; i++ ) {
+-	    driDestroyTextureHeap( rmesa->texture_heaps[ i ] );
+-	    rmesa->texture_heaps[ i ] = NULL;
+-         }
+-
+-	 assert( is_empty_list( & rmesa->swapped ) );
++      if (rmesa->radeon.state.scissor.pClipRects) {
++	 FREE(rmesa->radeon.state.scissor.pClipRects);
++	 rmesa->radeon.state.scissor.pClipRects = NULL;
+       }
+ 
+-      /* free the Mesa context */
+-      rmesa->glCtx->DriverCtx = NULL;
+-      _mesa_destroy_context( rmesa->glCtx );
+-
+-      /* free the option cache */
+-      driDestroyOptionCache (&rmesa->optionCache);
++      radeonCleanupContext(&rmesa->radeon);
+ 
+       FREE( rmesa );
+    }
+@@ -600,107 +533,6 @@ void r200DestroyContext( __DRIcontextPrivate *driContextPriv )
+ 
+ 
+ 
+-
+-void
+-r200SwapBuffers( __DRIdrawablePrivate *dPriv )
+-{
+-   if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
+-      r200ContextPtr rmesa;
+-      GLcontext *ctx;
+-      rmesa = (r200ContextPtr) dPriv->driContextPriv->driverPrivate;
+-      ctx = rmesa->glCtx;
+-      if (ctx->Visual.doubleBufferMode) {
+-         _mesa_notifySwapBuffers( ctx );  /* flush pending rendering comands */
+-         if ( rmesa->doPageFlip ) {
+-            r200PageFlip( dPriv );
+-         }
+-         else {
+-	     r200CopyBuffer( dPriv, NULL );
+-         }
+-      }
+-   }
+-   else {
+-      /* XXX this shouldn't be an error but we can't handle it for now */
+-      _mesa_problem(NULL, "%s: drawable has no context!", __FUNCTION__);
+-   }
+-}
+-
+-void
+-r200CopySubBuffer( __DRIdrawablePrivate *dPriv,
+-		   int x, int y, int w, int h )
+-{
+-   if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
+-      r200ContextPtr rmesa;
+-      GLcontext *ctx;
+-      rmesa = (r200ContextPtr) dPriv->driContextPriv->driverPrivate;
+-      ctx = rmesa->glCtx;
+-      if (ctx->Visual.doubleBufferMode) {
+-	 drm_clip_rect_t rect;
+-	 rect.x1 = x + dPriv->x;
+-	 rect.y1 = (dPriv->h - y - h) + dPriv->y;
+-	 rect.x2 = rect.x1 + w;
+-	 rect.y2 = rect.y1 + h;
+-         _mesa_notifySwapBuffers( ctx );  /* flush pending rendering comands */
+-	 r200CopyBuffer( dPriv, &rect );
+-      }
+-   }
+-   else {
+-      /* XXX this shouldn't be an error but we can't handle it for now */
+-      _mesa_problem(NULL, "%s: drawable has no context!", __FUNCTION__);
+-   }
+-}
+-
+-/* Force the context `c' to be the current context and associate with it
+- * buffer `b'.
+- */
+-GLboolean
+-r200MakeCurrent( __DRIcontextPrivate *driContextPriv,
+-                   __DRIdrawablePrivate *driDrawPriv,
+-                   __DRIdrawablePrivate *driReadPriv )
+-{
+-   if ( driContextPriv ) {
+-      r200ContextPtr newCtx = 
+-	 (r200ContextPtr) driContextPriv->driverPrivate;
+-
+-      if (R200_DEBUG & DEBUG_DRI)
+-	 fprintf(stderr, "%s ctx %p\n", __FUNCTION__, (void *)newCtx->glCtx);
+-
+-      newCtx->dri.readable = driReadPriv;
+-
+-      if ( newCtx->dri.drawable != driDrawPriv ||
+-           newCtx->lastStamp != driDrawPriv->lastStamp ) {
+-	 if (driDrawPriv->swap_interval == (unsigned)-1) {
+-	    driDrawPriv->vblFlags = (newCtx->r200Screen->irq != 0)
+-	       ? driGetDefaultVBlankFlags(&newCtx->optionCache)
+-	       : VBLANK_FLAG_NO_IRQ;
+-
+-	    driDrawableInitVBlank( driDrawPriv );
+-	 }
+-
+-	 newCtx->dri.drawable = driDrawPriv;
+-
+-	 r200SetCliprects(newCtx);
+-	 r200UpdateViewportOffset( newCtx->glCtx );
+-      }
+-
+-      _mesa_make_current( newCtx->glCtx,
+-			  (GLframebuffer *) driDrawPriv->driverPrivate,
+-			  (GLframebuffer *) driReadPriv->driverPrivate );
+-
+-      _mesa_update_state( newCtx->glCtx );
+-      r200ValidateState( newCtx->glCtx );
+-
+-   } else {
+-      if (R200_DEBUG & DEBUG_DRI)
+-	 fprintf(stderr, "%s ctx is null\n", __FUNCTION__);
+-      _mesa_make_current( NULL, NULL, NULL );
+-   }
+-
+-   if (R200_DEBUG & DEBUG_DRI)
+-      fprintf(stderr, "End %s\n", __FUNCTION__);
+-   return GL_TRUE;
+-}
+-
+ /* Force the context `c' to be unbound from its buffer.
+  */
+ GLboolean
+@@ -709,7 +541,7 @@ r200UnbindContext( __DRIcontextPrivate *driContextPriv )
+    r200ContextPtr rmesa = (r200ContextPtr) driContextPriv->driverPrivate;
+ 
+    if (R200_DEBUG & DEBUG_DRI)
+-      fprintf(stderr, "%s ctx %p\n", __FUNCTION__, (void *)rmesa->glCtx);
++      fprintf(stderr, "%s ctx %p\n", __FUNCTION__, (void *)rmesa->radeon.glCtx);
+ 
+    return GL_TRUE;
+ }
+diff --git a/src/mesa/drivers/dri/r200/r200_context.h b/src/mesa/drivers/dri/r200/r200_context.h
+index 14a1dda..fcbe725 100644
+--- a/src/mesa/drivers/dri/r200/r200_context.h
++++ b/src/mesa/drivers/dri/r200/r200_context.h
+@@ -53,51 +53,17 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #error This driver requires a newer libdrm to compile
+ #endif
+ 
++#include "radeon_screen.h"
++#include "radeon_common.h"
++
++#include "radeon_lock.h"
++
+ struct r200_context;
+ typedef struct r200_context r200ContextRec;
+ typedef struct r200_context *r200ContextPtr;
+ 
+-/* This union is used to avoid warnings/miscompilation
+-   with float to uint32_t casts due to strict-aliasing */
+-typedef union { GLfloat f; uint32_t ui32; } float_ui32_type;
+-
+-#include "r200_lock.h"
+-#include "radeon_screen.h"
+ #include "main/mm.h"
+ 
+-/* Flags for software fallback cases */
+-/* See correponding strings in r200_swtcl.c */
+-#define R200_FALLBACK_TEXTURE           0x01
+-#define R200_FALLBACK_DRAW_BUFFER       0x02
+-#define R200_FALLBACK_STENCIL           0x04
+-#define R200_FALLBACK_RENDER_MODE       0x08
+-#define R200_FALLBACK_DISABLE           0x10
+-#define R200_FALLBACK_BORDER_MODE       0x20
+-
+-/* The blit width for texture uploads
+- */
+-#define BLIT_WIDTH_BYTES 1024
+-
+-/* Use the templated vertex format:
+- */
+-#define COLOR_IS_RGBA
+-#define TAG(x) r200##x
+-#include "tnl_dd/t_dd_vertex.h"
+-#undef TAG
+-
+-typedef void (*r200_tri_func)( r200ContextPtr,
+-				 r200Vertex *,
+-				 r200Vertex *,
+-				 r200Vertex * );
+-
+-typedef void (*r200_line_func)( r200ContextPtr,
+-				  r200Vertex *,
+-				  r200Vertex * );
+-
+-typedef void (*r200_point_func)( r200ContextPtr,
+-				   r200Vertex * );
+-
+-
+ struct r200_vertex_program {
+         struct gl_vertex_program mesa_program; /* Must be first */
+         int translated;
+@@ -112,93 +78,11 @@ struct r200_vertex_program {
+         int fogmode;
+ };
+ 
+-struct r200_colorbuffer_state {
+-   GLuint clear;
+-#if 000
+-   GLint drawOffset, drawPitch;
+-#endif
+-   int roundEnable;
+-};
+-
+-
+-struct r200_depthbuffer_state {
+-   GLuint clear;
+-   GLfloat scale;
+-};
+-
+-#if 000
+-struct r200_pixel_state {
+-   GLint readOffset, readPitch;
+-};
+-#endif
+-
+-struct r200_scissor_state {
+-   drm_clip_rect_t rect;
+-   GLboolean enabled;
+-
+-   GLuint numClipRects;			/* Cliprects active */
+-   GLuint numAllocedClipRects;		/* Cliprects available */
+-   drm_clip_rect_t *pClipRects;
+-};
+-
+-struct r200_stencilbuffer_state {
+-   GLboolean hwBuffer;
+-   GLuint clear;			/* rb3d_stencilrefmask value */
+-};
+-
+-struct r200_stipple_state {
+-   GLuint mask[32];
+-};
+-
+-
+-
+-#define TEX_0   0x1
+-#define TEX_1   0x2
+-#define TEX_2	0x4
+-#define TEX_3	0x8
+-#define TEX_4	0x10
+-#define TEX_5	0x20
+-#define TEX_ALL 0x3f
+-
+-typedef struct r200_tex_obj r200TexObj, *r200TexObjPtr;
+-
+-/* Texture object in locally shared texture space.
+- */
+-struct r200_tex_obj {
+-   driTextureObject   base;
+-
+-   GLuint bufAddr;			/* Offset to start of locally
+-					   shared texture block */
+-
+-   GLuint dirty_state;		        /* Flags (1 per texunit) for
+-					   whether or not this texobj
+-					   has dirty hardware state
+-					   (pp_*) that needs to be
+-					   brought into the
+-					   texunit. */
+-
+-   drm_radeon_tex_image_t image[6][RADEON_MAX_TEXTURE_LEVELS];
+-					/* Six, for the cube faces */
+-   GLboolean image_override;		/* Image overridden by GLX_EXT_tfp */
+-
+-   GLuint pp_txfilter;		        /* hardware register values */
+-   GLuint pp_txformat;
+-   GLuint pp_txformat_x;
+-   GLuint pp_txoffset;		        /* Image location in texmem.
+-					   All cube faces follow. */
+-   GLuint pp_txsize;		        /* npot only */
+-   GLuint pp_txpitch;		        /* npot only */
+-   GLuint pp_border_color;
+-   GLuint pp_cubic_faces;	        /* cube face 1,2,3,4 log2 sizes */
+-
+-   GLboolean  border_fallback;
+-
+-   GLuint tile_bits;			/* hw texture tile bits used on this texture */
+-};
++#define R200_TEX_ALL 0x3f
+ 
+ 
+ struct r200_texture_env_state {
+-   r200TexObjPtr texobj;
++   radeonTexObjPtr texobj;
+    GLuint outputreg;
+    GLuint unitneeded;
+ };
+@@ -210,19 +94,6 @@ struct r200_texture_state {
+ };
+ 
+ 
+-struct r200_state_atom {
+-   struct r200_state_atom *next, *prev;
+-   const char *name;		         /* for debug */
+-   int cmd_size;		         /* size in bytes */
+-   GLuint idx;
+-   int *cmd;			         /* one or more cmd's */
+-   int *lastcmd;			 /* one or more cmd's */
+-   GLboolean dirty;
+-   GLboolean (*check)( GLcontext *, int );    /* is this state active? */
+-};
+-   
+-
+-
+ /* Trying to keep these relatively short as the variables are becoming
+  * extravagently long.  Drop the driver name prefix off the front of
+  * everything - I think we know which driver we're in by now, and keep the
+@@ -597,181 +468,85 @@ struct r200_state_atom {
+ 
+ 
+ struct r200_hw_state {
+-   /* Head of the linked list of state atoms. */
+-   struct r200_state_atom atomlist;
+-
+    /* Hardware state, stored as cmdbuf commands:  
+     *   -- Need to doublebuffer for
+     *           - reviving state after loss of context
+     *           - eliding noop statechange loops? (except line stipple count)
+     */
+-   struct r200_state_atom ctx;
+-   struct r200_state_atom set;
+-   struct r200_state_atom vte;
+-   struct r200_state_atom lin;
+-   struct r200_state_atom msk;
+-   struct r200_state_atom vpt;
+-   struct r200_state_atom vap;
+-   struct r200_state_atom vtx;
+-   struct r200_state_atom tcl;
+-   struct r200_state_atom msl;
+-   struct r200_state_atom tcg;
+-   struct r200_state_atom msc;
+-   struct r200_state_atom cst;
+-   struct r200_state_atom tam;
+-   struct r200_state_atom tf;
+-   struct r200_state_atom tex[6];
+-   struct r200_state_atom cube[6];
+-   struct r200_state_atom zbs;
+-   struct r200_state_atom mtl[2];
+-   struct r200_state_atom mat[9];
+-   struct r200_state_atom lit[8]; /* includes vec, scl commands */
+-   struct r200_state_atom ucp[6];
+-   struct r200_state_atom pix[6]; /* pixshader stages */
+-   struct r200_state_atom eye; /* eye pos */
+-   struct r200_state_atom grd; /* guard band clipping */
+-   struct r200_state_atom fog;
+-   struct r200_state_atom glt;
+-   struct r200_state_atom prf;
+-   struct r200_state_atom afs[2];
+-   struct r200_state_atom pvs;
+-   struct r200_state_atom vpi[2];
+-   struct r200_state_atom vpp[2];
+-   struct r200_state_atom atf;
+-   struct r200_state_atom spr;
+-   struct r200_state_atom ptp;
+-
+-   int max_state_size;	/* Number of bytes necessary for a full state emit. */
+-   GLboolean is_dirty, all_dirty;
++   struct radeon_state_atom ctx;
++   struct radeon_state_atom set;
++   struct radeon_state_atom vte;
++   struct radeon_state_atom lin;
++   struct radeon_state_atom msk;
++   struct radeon_state_atom vpt;
++   struct radeon_state_atom vap;
++   struct radeon_state_atom vtx;
++   struct radeon_state_atom tcl;
++   struct radeon_state_atom msl;
++   struct radeon_state_atom tcg;
++   struct radeon_state_atom msc;
++   struct radeon_state_atom cst;
++   struct radeon_state_atom tam;
++   struct radeon_state_atom tf;
++   struct radeon_state_atom tex[6];
++   struct radeon_state_atom cube[6];
++   struct radeon_state_atom zbs;
++   struct radeon_state_atom mtl[2];
++   struct radeon_state_atom mat[9];
++   struct radeon_state_atom lit[8]; /* includes vec, scl commands */
++   struct radeon_state_atom ucp[6];
++   struct radeon_state_atom pix[6]; /* pixshader stages */
++   struct radeon_state_atom eye; /* eye pos */
++   struct radeon_state_atom grd; /* guard band clipping */
++   struct radeon_state_atom fog;
++   struct radeon_state_atom glt;
++   struct radeon_state_atom prf;
++   struct radeon_state_atom afs[2];
++   struct radeon_state_atom pvs;
++   struct radeon_state_atom vpi[2];
++   struct radeon_state_atom vpp[2];
++   struct radeon_state_atom atf;
++   struct radeon_state_atom spr;
++   struct radeon_state_atom ptp;
+ };
+ 
+ struct r200_state {
+    /* Derived state for internal purposes:
+     */
+-   struct r200_colorbuffer_state color;
+-   struct r200_depthbuffer_state depth;
+-#if 00
+-   struct r200_pixel_state pixel;
+-#endif
+-   struct r200_scissor_state scissor;
+-   struct r200_stencilbuffer_state stencil;
+-   struct r200_stipple_state stipple;
++   struct radeon_stipple_state stipple;
+    struct r200_texture_state texture;
+    GLuint envneeded;
+ };
+ 
+-/* Need refcounting on dma buffers:
+- */
+-struct r200_dma_buffer {
+-   int refcount;		/* the number of retained regions in buf */
+-   drmBufPtr buf;
+-};
+-
+-#define GET_START(rvb) (rmesa->r200Screen->gart_buffer_offset +		\
+-			(rvb)->address - rmesa->dma.buf0_address +	\
+-			(rvb)->start)
+-
+-/* A retained region, eg vertices for indexed vertices.
+- */
+-struct r200_dma_region {
+-   struct r200_dma_buffer *buf;
+-   char *address;		/* == buf->address */
+-   int start, end, ptr;		/* offsets from start of buf */
+-   int aos_start;
+-   int aos_stride;
+-   int aos_size;
+-};
+-
+-
+-struct r200_dma {
+-   /* Active dma region.  Allocations for vertices and retained
+-    * regions come from here.  Also used for emitting random vertices,
+-    * these may be flushed by calling flush_current();
+-    */
+-   struct r200_dma_region current;
+-   
+-   void (*flush)( r200ContextPtr );
+-
+-   char *buf0_address;		/* start of buf[0], for index calcs */
+-   GLuint nr_released_bufs;	/* flush after so many buffers released */
+-};
+-
+-struct r200_dri_mirror {
+-   __DRIcontextPrivate	*context;	/* DRI context */
+-   __DRIscreenPrivate	*screen;	/* DRI screen */
+-   __DRIdrawablePrivate	*drawable;	/* DRI drawable bound to this ctx */
+-   __DRIdrawablePrivate	*readable;	/* DRI readable bound to this ctx */
+-
+-   drm_context_t hwContext;
+-   drm_hw_lock_t *hwLock;
+-   int fd;
+-   int drmMinor;
+-};
+-
+-
+ #define R200_CMD_BUF_SZ  (16*1024) 
+ 
+-struct r200_store {
+-   GLuint statenr;
+-   GLuint primnr;
+-   char cmd_buf[R200_CMD_BUF_SZ];
+-   int cmd_used;   
+-   int elts_start;
+-};
+-
+-
++#define R200_ELT_BUF_SZ  (16*1024) 
+ /* r200_tcl.c
+  */
+ struct r200_tcl_info {
+    GLuint hw_primitive;
+ 
+ /* hw can handle 12 components max */
+-   struct r200_dma_region *aos_components[12];
++  struct radeon_aos aos[12];
+    GLuint nr_aos_components;
+ 
+    GLuint *Elts;
+ 
+-   struct r200_dma_region indexed_verts;
+-   struct r200_dma_region vertex_data[15];
++   struct radeon_bo *elt_dma_bo;
++   int elt_dma_offset; /** Offset into this buffer object, in bytes */
++   int elt_used;
++
+ };
+ 
+ 
+ /* r200_swtcl.c
+  */
+ struct r200_swtcl_info {
+-   GLuint RenderIndex;
+-   
+-   /**
+-    * Size of a hardware vertex.  This is calculated when \c ::vertex_attrs is
+-    * installed in the Mesa state vector.
+-    */
+-   GLuint vertex_size;
+ 
+-   /**
+-    * Attributes instructing the Mesa TCL pipeline where / how to put vertex
+-    * data in the hardware buffer.
+-    */
+-   struct tnl_attr_map vertex_attrs[VERT_ATTRIB_MAX];
+ 
+-   /**
+-    * Number of elements of \c ::vertex_attrs that are actually used.
+-    */
+-   GLuint vertex_attr_count;
+-
+-   /**
+-    * Cached pointer to the buffer where Mesa will store vertex data.
+-    */
+-   GLubyte *verts;
+-
+-   /* Fallback rasterization functions
+-    */
+-   r200_point_func draw_point;
+-   r200_line_func draw_line;
+-   r200_tri_func draw_tri;
+-
+-   GLuint hw_primitive;
+-   GLenum render_primitive;
+-   GLuint numverts;
++   radeon_point_func draw_point;
++   radeon_line_func draw_line;
++   radeon_tri_func draw_tri;
+ 
+    /**
+     * Offset of the 4UB color data within a hardware (swtcl) vertex.
+@@ -787,27 +562,10 @@ struct r200_swtcl_info {
+     * Should Mesa project vertex data or will the hardware do it?
+     */
+    GLboolean needproj;
+-
+-   struct r200_dma_region indexed_verts;
+-};
+-
+-
+-struct r200_ioctl {
+-   GLuint vertex_offset;
+-   GLuint vertex_size;
+ };
+ 
+ 
+ 
+-#define R200_MAX_PRIMS 64
+-
+-
+-
+-struct r200_prim {
+-   GLuint start;
+-   GLuint end;
+-   GLuint prim;
+-};
+ 
+    /* A maximum total of 29 elements per vertex:  3 floats for position, 3
+     * floats for normal, 4 floats for color, 4 bytes for secondary color,
+@@ -822,9 +580,8 @@ struct r200_prim {
+ 
+ #define R200_MAX_VERTEX_SIZE ((3*6)+11)
+ 
+-
+ struct r200_context {
+-   GLcontext *glCtx;			/* Mesa context */
++   struct radeon_context radeon;
+ 
+    /* Driver and hardware state management
+     */
+@@ -832,56 +589,15 @@ struct r200_context {
+    struct r200_state state;
+    struct r200_vertex_program *curr_vp_hw;
+ 
+-   /* Texture object bookkeeping
+-    */
+-   unsigned              nr_heaps;
+-   driTexHeap          * texture_heaps[ RADEON_NR_TEX_HEAPS ];
+-   driTextureObject      swapped;
+-   int                   texture_depth;
+-   float                 initialMaxAnisotropy;
+-
+-   /* Rasterization and vertex state:
+-    */
+-   GLuint TclFallback;
+-   GLuint Fallback;
+-   GLuint NewGLState;
+-   DECLARE_RENDERINPUTS(tnl_index_bitset);	/* index of bits for last tnl_install_attrs */
+-
+    /* Vertex buffers
+     */
+-   struct r200_ioctl ioctl;
+-   struct r200_dma dma;
+-   struct r200_store store;
+-   /* A full state emit as of the first state emit in the main store, in case
+-    * the context is lost.
+-    */
+-   struct r200_store backup_store;
+-
+-   /* Page flipping
+-    */
+-   GLuint doPageFlip;
+-
+-   /* Busy waiting
+-    */
+-   GLuint do_usleeps;
+-   GLuint do_irqs;
+-   GLuint irqsEmitted;
+-   drm_radeon_irq_wait_t iw;
++   struct radeon_ioctl ioctl;
++   struct radeon_store store;
+ 
+    /* Clientdata textures;
+     */
+    GLuint prefer_gart_client_texturing;
+ 
+-   /* Drawable, cliprect and scissor information
+-    */
+-   GLuint numClipRects;			/* Cliprects for the draw buffer */
+-   drm_clip_rect_t *pClipRects;
+-   unsigned int lastStamp;
+-   GLboolean lost_context;
+-   GLboolean save_on_next_emit;
+-   radeonScreenPtr r200Screen;	/* Screen private DRI data */
+-   drm_radeon_sarea_t *sarea;		/* Private SAREA data */
+-
+    /* TCL stuff
+     */
+    GLmatrix TexGenMatrix[R200_MAX_TEXTURE_UNITS];
+@@ -893,15 +609,6 @@ struct r200_context {
+    GLuint TexGenCompSel;
+    GLmatrix tmpmat;
+ 
+-   /* buffer swap
+-    */
+-   int64_t swap_ust;
+-   int64_t swap_missed_ust;
+-
+-   GLuint swap_count;
+-   GLuint swap_missed_count;
+-
+-
+    /* r200_tcl.c
+     */
+    struct r200_tcl_info tcl;
+@@ -910,14 +617,6 @@ struct r200_context {
+     */
+    struct r200_swtcl_info swtcl;
+ 
+-   /* Mirrors of some DRI state
+-    */
+-   struct r200_dri_mirror dri;
+-
+-   /* Configuration cache
+-    */
+-   driOptionCache optionCache;
+-
+    GLboolean using_hyperz;
+    GLboolean texmicrotile;
+ 
+@@ -927,28 +626,10 @@ struct r200_context {
+ #define R200_CONTEXT(ctx)		((r200ContextPtr)(ctx->DriverCtx))
+ 
+ 
+-static INLINE GLuint r200PackColor( GLuint cpp,
+-					GLubyte r, GLubyte g,
+-					GLubyte b, GLubyte a )
+-{
+-   switch ( cpp ) {
+-   case 2:
+-      return PACK_COLOR_565( r, g, b );
+-   case 4:
+-      return PACK_COLOR_8888( a, r, g, b );
+-   default:
+-      return 0;
+-   }
+-}
+-
+-
+ extern void r200DestroyContext( __DRIcontextPrivate *driContextPriv );
+ extern GLboolean r200CreateContext( const __GLcontextModes *glVisual,
+ 				    __DRIcontextPrivate *driContextPriv,
+ 				    void *sharedContextPrivate);
+-extern void r200SwapBuffers( __DRIdrawablePrivate *dPriv );
+-extern void r200CopySubBuffer( __DRIdrawablePrivate * dPriv,
+-			       int x, int y, int w, int h );
+ extern GLboolean r200MakeCurrent( __DRIcontextPrivate *driContextPriv,
+ 				  __DRIdrawablePrivate *driDrawPriv,
+ 				  __DRIdrawablePrivate *driReadPriv );
+@@ -957,28 +638,9 @@ extern GLboolean r200UnbindContext( __DRIcontextPrivate *driContextPriv );
+ /* ================================================================
+  * Debugging:
+  */
+-#define DO_DEBUG		1
+ 
+-#if DO_DEBUG
+-extern int R200_DEBUG;
+-#else
+-#define R200_DEBUG		0
+-#endif
++#define R200_DEBUG RADEON_DEBUG
++
+ 
+-#define DEBUG_TEXTURE	0x001
+-#define DEBUG_STATE	0x002
+-#define DEBUG_IOCTL	0x004
+-#define DEBUG_PRIMS	0x008
+-#define DEBUG_VERTS	0x010
+-#define DEBUG_FALLBACKS	0x020
+-#define DEBUG_VFMT	0x040
+-#define DEBUG_CODEGEN	0x080
+-#define DEBUG_VERBOSE	0x100
+-#define DEBUG_DRI       0x200
+-#define DEBUG_DMA       0x400
+-#define DEBUG_SANITY    0x800
+-#define DEBUG_SYNC      0x1000
+-#define DEBUG_PIXEL     0x2000
+-#define DEBUG_MEMORY    0x4000
+ 
+ #endif /* __R200_CONTEXT_H__ */
+diff --git a/src/mesa/drivers/dri/r200/r200_fragshader.c b/src/mesa/drivers/dri/r200/r200_fragshader.c
+index d514b28..85c1b7b 100644
+--- a/src/mesa/drivers/dri/r200/r200_fragshader.c
++++ b/src/mesa/drivers/dri/r200/r200_fragshader.c
+@@ -522,7 +522,7 @@ static void r200UpdateFSConstants( GLcontext *ctx )
+ 	 CLAMPED_FLOAT_TO_UBYTE(con_byte[2], ctx->ATIFragmentShader.GlobalConstants[i][2]);
+ 	 CLAMPED_FLOAT_TO_UBYTE(con_byte[3], ctx->ATIFragmentShader.GlobalConstants[i][3]);
+       }
+-      rmesa->hw.atf.cmd[ATF_TFACTOR_0 + i] = r200PackColor (
++      rmesa->hw.atf.cmd[ATF_TFACTOR_0 + i] = radeonPackColor (
+ 	 4, con_byte[0], con_byte[1], con_byte[2], con_byte[3] );
+    }
+ }
+diff --git a/src/mesa/drivers/dri/r200/r200_ioctl.c b/src/mesa/drivers/dri/r200/r200_ioctl.c
+index 0741e57..c08968f 100644
+--- a/src/mesa/drivers/dri/r200/r200_ioctl.c
++++ b/src/mesa/drivers/dri/r200/r200_ioctl.c
+@@ -41,6 +41,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "main/context.h"
+ #include "swrast/swrast.h"
+ 
++#include "radeon_common.h"
++#include "radeon_lock.h"
+ #include "r200_context.h"
+ #include "r200_state.h"
+ #include "r200_ioctl.h"
+@@ -54,635 +56,28 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #define R200_TIMEOUT             512
+ #define R200_IDLE_RETRY           16
+ 
+-
+-static void r200WaitForIdle( r200ContextPtr rmesa );
+-
+-
+-/* At this point we were in FlushCmdBufLocked but we had lost our context, so
+- * we need to unwire our current cmdbuf, hook the one with the saved state in
+- * it, flush it, and then put the current one back.  This is so commands at the
+- * start of a cmdbuf can rely on the state being kept from the previous one.
+- */
+-static void r200BackUpAndEmitLostStateLocked( r200ContextPtr rmesa )
+-{
+-   GLuint nr_released_bufs;
+-   struct r200_store saved_store;
+-
+-   if (rmesa->backup_store.cmd_used == 0)
+-      return;
+-
+-   if (R200_DEBUG & DEBUG_STATE)
+-      fprintf(stderr, "Emitting backup state on lost context\n");
+-
+-   rmesa->lost_context = GL_FALSE;
+-
+-   nr_released_bufs = rmesa->dma.nr_released_bufs;
+-   saved_store = rmesa->store;
+-   rmesa->dma.nr_released_bufs = 0;
+-   rmesa->store = rmesa->backup_store;
+-   r200FlushCmdBufLocked( rmesa, __FUNCTION__ );
+-   rmesa->dma.nr_released_bufs = nr_released_bufs;
+-   rmesa->store = saved_store;
+-}
+-
+-int r200FlushCmdBufLocked( r200ContextPtr rmesa, const char * caller )
+-{
+-   int ret, i;
+-   drm_radeon_cmd_buffer_t cmd;
+-
+-   if (rmesa->lost_context)
+-      r200BackUpAndEmitLostStateLocked( rmesa );
+-
+-   if (R200_DEBUG & DEBUG_IOCTL) {
+-      fprintf(stderr, "%s from %s\n", __FUNCTION__, caller); 
+-
+-      if (0 & R200_DEBUG & DEBUG_VERBOSE) 
+-	 for (i = 0 ; i < rmesa->store.cmd_used ; i += 4 )
+-	    fprintf(stderr, "%d: %x\n", i/4, 
+-		    *(int *)(&rmesa->store.cmd_buf[i]));
+-   }
+-
+-   if (R200_DEBUG & DEBUG_DMA)
+-      fprintf(stderr, "%s: Releasing %d buffers\n", __FUNCTION__,
+-	      rmesa->dma.nr_released_bufs);
+-
+-
+-   if (R200_DEBUG & DEBUG_SANITY) {
+-      if (rmesa->state.scissor.enabled) 
+-	 ret = r200SanityCmdBuffer( rmesa, 
+-				    rmesa->state.scissor.numClipRects,
+-				    rmesa->state.scissor.pClipRects);
+-      else
+-	 ret = r200SanityCmdBuffer( rmesa, 
+-				    rmesa->numClipRects,
+-				    rmesa->pClipRects);
+-      if (ret) {
+-	 fprintf(stderr, "drmSanityCommandWrite: %d\n", ret);	 
+-	 goto out;
+-      }
+-   }
+-
+-
+-   if (R200_DEBUG & DEBUG_MEMORY) {
+-      if (! driValidateTextureHeaps( rmesa->texture_heaps, rmesa->nr_heaps,
+-				     & rmesa->swapped ) ) {
+-	 fprintf( stderr, "%s: texture memory is inconsistent - expect "
+-		  "mangled textures\n", __FUNCTION__ );
+-      }
+-   }
+-
+-
+-   cmd.bufsz = rmesa->store.cmd_used;
+-   cmd.buf = rmesa->store.cmd_buf;
+-
+-   if (rmesa->state.scissor.enabled) {
+-      cmd.nbox = rmesa->state.scissor.numClipRects;
+-      cmd.boxes = (drm_clip_rect_t *)rmesa->state.scissor.pClipRects;
+-   } else {
+-      cmd.nbox = rmesa->numClipRects;
+-      cmd.boxes = (drm_clip_rect_t *)rmesa->pClipRects;
+-   }
+-
+-   ret = drmCommandWrite( rmesa->dri.fd,
+-			  DRM_RADEON_CMDBUF,
+-			  &cmd, sizeof(cmd) );
+-
+-   if (ret)
+-      fprintf(stderr, "drmCommandWrite: %d\n", ret);
+-
+-   if (R200_DEBUG & DEBUG_SYNC) {
+-      fprintf(stderr, "\nSyncing in %s\n\n", __FUNCTION__);
+-      r200WaitForIdleLocked( rmesa );
+-   }
+-
+-
+- out:
+-   rmesa->store.primnr = 0;
+-   rmesa->store.statenr = 0;
+-   rmesa->store.cmd_used = 0;
+-   rmesa->dma.nr_released_bufs = 0;
+-   rmesa->save_on_next_emit = 1;
+-
+-   return ret;
+-}
+-
+-
+-/* Note: does not emit any commands to avoid recursion on
+- * r200AllocCmdBuf.
+- */
+-void r200FlushCmdBuf( r200ContextPtr rmesa, const char *caller )
+-{
+-   int ret;
+-
+-   LOCK_HARDWARE( rmesa );
+-
+-   ret = r200FlushCmdBufLocked( rmesa, caller );
+-
+-   UNLOCK_HARDWARE( rmesa );
+-
+-   if (ret) {
+-      fprintf(stderr, "drmRadeonCmdBuffer: %d (exiting)\n", ret);
+-      exit(ret);
+-   }
+-}
+-
+-
+-/* =============================================================
+- * Hardware vertex buffer handling
+- */
+-
+-
+-void r200RefillCurrentDmaRegion( r200ContextPtr rmesa )
+-{
+-   struct r200_dma_buffer *dmabuf;
+-   int fd = rmesa->dri.fd;
+-   int index = 0;
+-   int size = 0;
+-   drmDMAReq dma;
+-   int ret;
+-
+-   if (R200_DEBUG & (DEBUG_IOCTL|DEBUG_DMA))
+-      fprintf(stderr, "%s\n", __FUNCTION__);  
+-
+-   if (rmesa->dma.flush) {
+-      rmesa->dma.flush( rmesa );
+-   }
+-
+-   if (rmesa->dma.current.buf)
+-      r200ReleaseDmaRegion( rmesa, &rmesa->dma.current, __FUNCTION__ );
+-
+-   if (rmesa->dma.nr_released_bufs > 4)
+-      r200FlushCmdBuf( rmesa, __FUNCTION__ );
+-
+-   dma.context = rmesa->dri.hwContext;
+-   dma.send_count = 0;
+-   dma.send_list = NULL;
+-   dma.send_sizes = NULL;
+-   dma.flags = 0;
+-   dma.request_count = 1;
+-   dma.request_size = RADEON_BUFFER_SIZE;
+-   dma.request_list = &index;
+-   dma.request_sizes = &size;
+-   dma.granted_count = 0;
+-
+-   LOCK_HARDWARE(rmesa);	/* no need to validate */
+-
+-   while (1) {
+-      ret = drmDMA( fd, &dma );
+-      if (ret == 0)
+-	 break;
+-   
+-      if (rmesa->dma.nr_released_bufs) {
+-	 r200FlushCmdBufLocked( rmesa, __FUNCTION__ );
+-      }
+-
+-      if (rmesa->do_usleeps) {
+-	 UNLOCK_HARDWARE( rmesa );
+-	 DO_USLEEP( 1 );
+-	 LOCK_HARDWARE( rmesa );
+-      }
+-   }
+-
+-   UNLOCK_HARDWARE(rmesa);
+-
+-   if (R200_DEBUG & DEBUG_DMA)
+-      fprintf(stderr, "Allocated buffer %d\n", index);
+-
+-   dmabuf = CALLOC_STRUCT( r200_dma_buffer );
+-   dmabuf->buf = &rmesa->r200Screen->buffers->list[index];
+-   dmabuf->refcount = 1;
+-
+-   rmesa->dma.current.buf = dmabuf;
+-   rmesa->dma.current.address = dmabuf->buf->address;
+-   rmesa->dma.current.end = dmabuf->buf->total;
+-   rmesa->dma.current.start = 0;
+-   rmesa->dma.current.ptr = 0;
+-}
+-
+-void r200ReleaseDmaRegion( r200ContextPtr rmesa,
+-			     struct r200_dma_region *region,
+-			     const char *caller )
+-{
+-   if (R200_DEBUG & DEBUG_IOCTL)
+-      fprintf(stderr, "%s from %s\n", __FUNCTION__, caller); 
+-   
+-   if (!region->buf)
+-      return;
+-
+-   if (rmesa->dma.flush)
+-      rmesa->dma.flush( rmesa );
+-
+-   if (--region->buf->refcount == 0) {
+-      drm_radeon_cmd_header_t *cmd;
+-
+-      if (R200_DEBUG & (DEBUG_IOCTL|DEBUG_DMA))
+-	 fprintf(stderr, "%s -- DISCARD BUF %d\n", __FUNCTION__,
+-		 region->buf->buf->idx);  
+-      
+-      cmd = (drm_radeon_cmd_header_t *)r200AllocCmdBuf( rmesa, sizeof(*cmd), 
+-						     __FUNCTION__ );
+-      cmd->dma.cmd_type = RADEON_CMD_DMA_DISCARD;
+-      cmd->dma.buf_idx = region->buf->buf->idx;
+-      FREE(region->buf);
+-      rmesa->dma.nr_released_bufs++;
+-   }
+-
+-   region->buf = NULL;
+-   region->start = 0;
+-}
+-
+-/* Allocates a region from rmesa->dma.current.  If there isn't enough
+- * space in current, grab a new buffer (and discard what was left of current)
+- */
+-void r200AllocDmaRegion( r200ContextPtr rmesa, 
+-			   struct r200_dma_region *region,
+-			   int bytes,
+-			   int alignment )
+-{
+-   if (R200_DEBUG & DEBUG_IOCTL)
+-      fprintf(stderr, "%s %d\n", __FUNCTION__, bytes);
+-
+-   if (rmesa->dma.flush)
+-      rmesa->dma.flush( rmesa );
+-
+-   if (region->buf)
+-      r200ReleaseDmaRegion( rmesa, region, __FUNCTION__ );
+-
+-   alignment--;
+-   rmesa->dma.current.start = rmesa->dma.current.ptr = 
+-      (rmesa->dma.current.ptr + alignment) & ~alignment;
+-
+-   if ( rmesa->dma.current.ptr + bytes > rmesa->dma.current.end ) 
+-      r200RefillCurrentDmaRegion( rmesa );
+-
+-   region->start = rmesa->dma.current.start;
+-   region->ptr = rmesa->dma.current.start;
+-   region->end = rmesa->dma.current.start + bytes;
+-   region->address = rmesa->dma.current.address;
+-   region->buf = rmesa->dma.current.buf;
+-   region->buf->refcount++;
+-
+-   rmesa->dma.current.ptr += bytes; /* bug - if alignment > 7 */
+-   rmesa->dma.current.start = 
+-      rmesa->dma.current.ptr = (rmesa->dma.current.ptr + 0x7) & ~0x7;  
+-
+-   assert( rmesa->dma.current.ptr <= rmesa->dma.current.end );
+-}
+-
+-/* ================================================================
+- * SwapBuffers with client-side throttling
+- */
+-
+-static uint32_t r200GetLastFrame(r200ContextPtr rmesa)
+-{
+-   drm_radeon_getparam_t gp;
+-   int ret;
+-   uint32_t frame;
+-
+-   gp.param = RADEON_PARAM_LAST_FRAME;
+-   gp.value = (int *)&frame;
+-   ret = drmCommandWriteRead( rmesa->dri.fd, DRM_RADEON_GETPARAM,
+-			      &gp, sizeof(gp) );
+-   if ( ret ) {
+-      fprintf( stderr, "%s: drmRadeonGetParam: %d\n", __FUNCTION__, ret );
+-      exit(1);
+-   }
+-
+-   return frame;
+-}
+-
+-static void r200EmitIrqLocked( r200ContextPtr rmesa )
+-{
+-   drm_radeon_irq_emit_t ie;
+-   int ret;
+-
+-   ie.irq_seq = &rmesa->iw.irq_seq;
+-   ret = drmCommandWriteRead( rmesa->dri.fd, DRM_RADEON_IRQ_EMIT, 
+-			      &ie, sizeof(ie) );
+-   if ( ret ) {
+-      fprintf( stderr, "%s: drmRadeonIrqEmit: %d\n", __FUNCTION__, ret );
+-      exit(1);
+-   }
+-}
+-
+-
+-static void r200WaitIrq( r200ContextPtr rmesa )
+-{
+-   int ret;
+-
+-   do {
+-      ret = drmCommandWrite( rmesa->dri.fd, DRM_RADEON_IRQ_WAIT,
+-			     &rmesa->iw, sizeof(rmesa->iw) );
+-   } while (ret && (errno == EINTR || errno == EBUSY));
+-
+-   if ( ret ) {
+-      fprintf( stderr, "%s: drmRadeonIrqWait: %d\n", __FUNCTION__, ret );
+-      exit(1);
+-   }
+-}
+-
+-
+-static void r200WaitForFrameCompletion( r200ContextPtr rmesa )
+-{
+-   drm_radeon_sarea_t *sarea = rmesa->sarea;
+-
+-   if (rmesa->do_irqs) {
+-      if (r200GetLastFrame(rmesa) < sarea->last_frame) {
+-	 if (!rmesa->irqsEmitted) {
+-	    while (r200GetLastFrame (rmesa) < sarea->last_frame)
+-	       ;
+-	 }
+-	 else {
+-	    UNLOCK_HARDWARE( rmesa ); 
+-	    r200WaitIrq( rmesa );	
+-	    LOCK_HARDWARE( rmesa ); 
+-	 }
+-	 rmesa->irqsEmitted = 10;
+-      }
+-
+-      if (rmesa->irqsEmitted) {
+-	 r200EmitIrqLocked( rmesa );
+-	 rmesa->irqsEmitted--;
+-      }
+-   } 
+-   else {
+-      while (r200GetLastFrame (rmesa) < sarea->last_frame) {
+-	 UNLOCK_HARDWARE( rmesa ); 
+-	 if (rmesa->do_usleeps) 
+-	    DO_USLEEP( 1 );
+-	 LOCK_HARDWARE( rmesa ); 
+-      }
+-   }
+-}
+-
+-
+-
+-/* Copy the back color buffer to the front color buffer.
+- */
+-void r200CopyBuffer( __DRIdrawablePrivate *dPriv,
+-		      const drm_clip_rect_t	 *rect)
+-{
+-   r200ContextPtr rmesa;
+-   GLint nbox, i, ret;
+-   GLboolean   missed_target;
+-   int64_t ust;
+-   __DRIscreenPrivate *psp = dPriv->driScreenPriv;
+-
+-   assert(dPriv);
+-   assert(dPriv->driContextPriv);
+-   assert(dPriv->driContextPriv->driverPrivate);
+-
+-   rmesa = (r200ContextPtr) dPriv->driContextPriv->driverPrivate;
+-
+-   if ( R200_DEBUG & DEBUG_IOCTL ) {
+-      fprintf( stderr, "\n%s( %p )\n\n", __FUNCTION__, (void *)rmesa->glCtx );
+-   }
+-
+-   R200_FIREVERTICES( rmesa );
+-
+-   LOCK_HARDWARE( rmesa );
+-
+-
+-   /* Throttle the frame rate -- only allow one pending swap buffers
+-    * request at a time.
+-    */
+-   r200WaitForFrameCompletion( rmesa );
+-   if (!rect)
+-   {
+-       UNLOCK_HARDWARE( rmesa );
+-       driWaitForVBlank( dPriv, & missed_target );
+-       LOCK_HARDWARE( rmesa );
+-   }
+-
+-   nbox = dPriv->numClipRects; /* must be in locked region */
+-
+-   for ( i = 0 ; i < nbox ; ) {
+-      GLint nr = MIN2( i + RADEON_NR_SAREA_CLIPRECTS , nbox );
+-      drm_clip_rect_t *box = dPriv->pClipRects;
+-      drm_clip_rect_t *b = rmesa->sarea->boxes;
+-      GLint n = 0;
+-
+-      for ( ; i < nr ; i++ ) {
+-
+-	  *b = box[i];
+-
+-	  if (rect)
+-	  {
+-	     if (rect->x1 > b->x1)
+-		 b->x1 = rect->x1;
+-	     if (rect->y1 > b->y1)
+-		 b->y1 = rect->y1;
+-	     if (rect->x2 < b->x2)
+-		 b->x2 = rect->x2;
+-	     if (rect->y2 < b->y2)
+-		 b->y2 = rect->y2;
+-
+-	     if (b->x1 >= b->x2 || b->y1 >= b->y2)
+-		 continue;
+-	  }
+-
+-	  b++;
+-	  n++;
+-      }
+-      rmesa->sarea->nbox = n;
+-
+-      if (!n)
+-	 continue;
+-
+-      ret = drmCommandNone( rmesa->dri.fd, DRM_RADEON_SWAP );
+-
+-      if ( ret ) {
+-	 fprintf( stderr, "DRM_R200_SWAP_BUFFERS: return = %d\n", ret );
+-	 UNLOCK_HARDWARE( rmesa );
+-	 exit( 1 );
+-      }
+-   }
+-
+-   UNLOCK_HARDWARE( rmesa );
+-   if (!rect)
+-   {
+-       rmesa->hw.all_dirty = GL_TRUE;
+-
+-       rmesa->swap_count++;
+-       (*psp->systemTime->getUST)( & ust );
+-       if ( missed_target ) {
+-	   rmesa->swap_missed_count++;
+-	   rmesa->swap_missed_ust = ust - rmesa->swap_ust;
+-       }
+-
+-       rmesa->swap_ust = ust;
+-
+-       sched_yield();
+-   }
+-}
+-
+-void r200PageFlip( __DRIdrawablePrivate *dPriv )
++static void r200UserClear(GLcontext *ctx, GLuint flags)
+ {
+-   r200ContextPtr rmesa;
+-   GLint ret;
+-   GLboolean   missed_target;
+-   __DRIscreenPrivate *psp = dPriv->driScreenPriv;
+-
+-   assert(dPriv);
+-   assert(dPriv->driContextPriv);
+-   assert(dPriv->driContextPriv->driverPrivate);
++   if (flags & (RADEON_FRONT | RADEON_BACK)) {
+ 
+-   rmesa = (r200ContextPtr) dPriv->driContextPriv->driverPrivate;
+ 
+-   if ( R200_DEBUG & DEBUG_IOCTL ) {
+-      fprintf(stderr, "%s: pfCurrentPage: %d\n", __FUNCTION__,
+-	      rmesa->sarea->pfCurrentPage);
+-   }
+-
+-   R200_FIREVERTICES( rmesa );
+-   LOCK_HARDWARE( rmesa );
+-
+-   if (!dPriv->numClipRects) {
+-      UNLOCK_HARDWARE( rmesa );
+-      usleep( 10000 );		/* throttle invisible client 10ms */
+-      return;
+    }
++	  
++   if ((flags & (RADEON_DEPTH | RADEON_STENCIL))
++       && (flags & RADEON_CLEAR_FASTZ)) {
+ 
+-   /* Need to do this for the perf box placement:
+-    */
+-   {
+-      drm_clip_rect_t *box = dPriv->pClipRects;
+-      drm_clip_rect_t *b = rmesa->sarea->boxes;
+-      b[0] = box[0];
+-      rmesa->sarea->nbox = 1;
+-   }
+-
+-   /* Throttle the frame rate -- only allow a few pending swap buffers
+-    * request at a time.
+-    */
+-   r200WaitForFrameCompletion( rmesa );
+-   UNLOCK_HARDWARE( rmesa );
+-   driWaitForVBlank( dPriv, & missed_target );
+-   if ( missed_target ) {
+-      rmesa->swap_missed_count++;
+-      (void) (*psp->systemTime->getUST)( & rmesa->swap_missed_ust );
+    }
+-   LOCK_HARDWARE( rmesa );
+ 
+-   ret = drmCommandNone( rmesa->dri.fd, DRM_RADEON_FLIP );
+-
+-   UNLOCK_HARDWARE( rmesa );
+-
+-   if ( ret ) {
+-      fprintf( stderr, "DRM_RADEON_FLIP: return = %d\n", ret );
+-      exit( 1 );
+-   }
+-
+-   rmesa->swap_count++;
+-   (void) (*psp->systemTime->getUST)( & rmesa->swap_ust );
+-
+-#if 000
+-   if ( rmesa->sarea->pfCurrentPage == 1 ) {
+-	 rmesa->state.color.drawOffset = rmesa->r200Screen->frontOffset;
+-	 rmesa->state.color.drawPitch  = rmesa->r200Screen->frontPitch;
+-   } else {
+-	 rmesa->state.color.drawOffset = rmesa->r200Screen->backOffset;
+-	 rmesa->state.color.drawPitch  = rmesa->r200Screen->backPitch;
+-   }
+-
+-   R200_STATECHANGE( rmesa, ctx );
+-   rmesa->hw.ctx.cmd[CTX_RB3D_COLOROFFSET] = rmesa->state.color.drawOffset
+-					   + rmesa->r200Screen->fbLocation;
+-   rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH]  = rmesa->state.color.drawPitch;
+-   if (rmesa->sarea->tiling_enabled) {
+-      rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |= R200_COLOR_TILE_ENABLE;
+-   }
+-#else
+-   /* Get ready for drawing next frame.  Update the renderbuffers'
+-    * flippedOffset/Pitch fields so we draw into the right place.
+-    */
+-   driFlipRenderbuffers(rmesa->glCtx->WinSysDrawBuffer,
+-                        rmesa->sarea->pfCurrentPage);
+-
+-
+-   r200UpdateDrawBuffer(rmesa->glCtx);
+-#endif
+ }
+ 
+-
+-/* ================================================================
+- * Buffer clear
+- */
+-static void r200Clear( GLcontext *ctx, GLbitfield mask )
++static void r200KernelClear(GLcontext *ctx, GLuint flags)
+ {
+    r200ContextPtr rmesa = R200_CONTEXT(ctx);
+-   __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
+-   GLuint flags = 0;
+-   GLuint color_mask = 0;
+-   GLint ret, i;
+-   GLint cx, cy, cw, ch;
++   __DRIdrawablePrivate *dPriv = rmesa->radeon.dri.drawable;
++   GLint cx, cy, cw, ch, ret;
++   GLuint i;
+ 
+-   if ( R200_DEBUG & DEBUG_IOCTL ) {
+-      fprintf( stderr, "r200Clear\n");
+-   }
+-
+-   {
+-      LOCK_HARDWARE( rmesa );
+-      UNLOCK_HARDWARE( rmesa );
+-      if ( dPriv->numClipRects == 0 ) 
+-	 return;
+-   }
+-
+-   r200Flush( ctx );
+-
+-   if ( mask & BUFFER_BIT_FRONT_LEFT ) {
+-      flags |= RADEON_FRONT;
+-      color_mask = rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK];
+-      mask &= ~BUFFER_BIT_FRONT_LEFT;
+-   }
+-
+-   if ( mask & BUFFER_BIT_BACK_LEFT ) {
+-      flags |= RADEON_BACK;
+-      color_mask = rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK];
+-      mask &= ~BUFFER_BIT_BACK_LEFT;
+-   }
+-
+-   if ( mask & BUFFER_BIT_DEPTH ) {
+-      flags |= RADEON_DEPTH;
+-      mask &= ~BUFFER_BIT_DEPTH;
+-   }
+-
+-   if ( (mask & BUFFER_BIT_STENCIL) && rmesa->state.stencil.hwBuffer ) {
+-      flags |= RADEON_STENCIL;
+-      mask &= ~BUFFER_BIT_STENCIL;
+-   }
+-
+-   if ( mask ) {
+-      if (R200_DEBUG & DEBUG_FALLBACKS)
+-	 fprintf(stderr, "%s: swrast clear, mask: %x\n", __FUNCTION__, mask);
+-      _swrast_Clear( ctx, mask );
+-   }
+-
+-   if ( !flags ) 
+-      return;
+-
+-   if (rmesa->using_hyperz) {
+-      flags |= RADEON_USE_COMP_ZBUF;
+-/*      if (rmesa->r200Screen->chip_family == CHIP_FAMILY_R200)
+-	 flags |= RADEON_USE_HIERZ; */
+-      if (!(rmesa->state.stencil.hwBuffer) ||
+-	 ((flags & RADEON_DEPTH) && (flags & RADEON_STENCIL) &&
+-	    ((rmesa->state.stencil.clear & R200_STENCIL_WRITE_MASK) == R200_STENCIL_WRITE_MASK))) {
+-	  flags |= RADEON_CLEAR_FASTZ;
+-      }
+-   }
+-
+-   LOCK_HARDWARE( rmesa );
+-
+-   /* compute region after locking: */
+-   cx = ctx->DrawBuffer->_Xmin;
+-   cy = ctx->DrawBuffer->_Ymin;
+-   cw = ctx->DrawBuffer->_Xmax - cx;
+-   ch = ctx->DrawBuffer->_Ymax - cy;
+-
+-   /* Flip top to bottom */
+-   cx += dPriv->x;
+-   cy  = dPriv->y + dPriv->h - cy - ch;
++   LOCK_HARDWARE( &rmesa->radeon );
+ 
+    /* Throttle the number of clear ioctls we do.
+     */
+@@ -693,7 +88,7 @@ static void r200Clear( GLcontext *ctx, GLbitfield mask )
+ 
+       gp.param = RADEON_PARAM_LAST_CLEAR;
+       gp.value = (int *)&clear;
+-      ret = drmCommandWriteRead( rmesa->dri.fd,
++      ret = drmCommandWriteRead( rmesa->radeon.dri.fd,
+ 		      DRM_RADEON_GETPARAM, &gp, sizeof(gp) );
+ 
+       if ( ret ) {
+@@ -703,24 +98,34 @@ static void r200Clear( GLcontext *ctx, GLbitfield mask )
+ 
+       /* Clear throttling needs more thought.
+        */
+-      if ( rmesa->sarea->last_clear - clear <= 25 ) {
++      if ( rmesa->radeon.sarea->last_clear - clear <= 25 ) {
+ 	 break;
+       }
+       
+-      if (rmesa->do_usleeps) {
+-	 UNLOCK_HARDWARE( rmesa );
++      if (rmesa->radeon.do_usleeps) {
++	 UNLOCK_HARDWARE( &rmesa->radeon );
+ 	 DO_USLEEP( 1 );
+-	 LOCK_HARDWARE( rmesa );
++	 LOCK_HARDWARE( &rmesa->radeon );
+       }
+    }
+ 
+    /* Send current state to the hardware */
+-   r200FlushCmdBufLocked( rmesa, __FUNCTION__ );
++   rcommonFlushCmdBufLocked( &rmesa->radeon, __FUNCTION__ );
++
++
++  /* compute region after locking: */
++   cx = ctx->DrawBuffer->_Xmin;
++   cy = ctx->DrawBuffer->_Ymin;
++   cw = ctx->DrawBuffer->_Xmax - cx;
++   ch = ctx->DrawBuffer->_Ymax - cy;
+ 
++   /* Flip top to bottom */
++   cx += dPriv->x;
++   cy  = dPriv->y + dPriv->h - cy - ch;
+    for ( i = 0 ; i < dPriv->numClipRects ; ) {
+       GLint nr = MIN2( i + RADEON_NR_SAREA_CLIPRECTS, dPriv->numClipRects );
+       drm_clip_rect_t *box = dPriv->pClipRects;
+-      drm_clip_rect_t *b = rmesa->sarea->boxes;
++      drm_clip_rect_t *b = rmesa->radeon.sarea->boxes;
+       drm_radeon_clear_t clear;
+       drm_radeon_clear_rect_t depth_boxes[RADEON_NR_SAREA_CLIPRECTS];
+       GLint n = 0;
+@@ -755,17 +160,17 @@ static void r200Clear( GLcontext *ctx, GLbitfield mask )
+ 	 }
+       }
+ 
+-      rmesa->sarea->nbox = n;
++      rmesa->radeon.sarea->nbox = n;
+ 
+       clear.flags       = flags;
+-      clear.clear_color = rmesa->state.color.clear;
+-      clear.clear_depth = rmesa->state.depth.clear;	/* needed for hyperz */
++      clear.clear_color = rmesa->radeon.state.color.clear;
++      clear.clear_depth = rmesa->radeon.state.depth.clear;	/* needed for hyperz */
+       clear.color_mask  = rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK];
+-      clear.depth_mask  = rmesa->state.stencil.clear;
++      clear.depth_mask  = rmesa->radeon.state.stencil.clear;
+       clear.depth_boxes = depth_boxes;
+ 
+       n--;
+-      b = rmesa->sarea->boxes;
++      b = rmesa->radeon.sarea->boxes;
+       for ( ; n >= 0 ; n-- ) {
+ 	 depth_boxes[n].f[CLEAR_X1] = (float)b[n].x1;
+ 	 depth_boxes[n].f[CLEAR_Y1] = (float)b[n].y1;
+@@ -774,83 +179,91 @@ static void r200Clear( GLcontext *ctx, GLbitfield mask )
+ 	 depth_boxes[n].f[CLEAR_DEPTH] = ctx->Depth.Clear;
+       }
+ 
+-      ret = drmCommandWrite( rmesa->dri.fd, DRM_RADEON_CLEAR,
++      ret = drmCommandWrite( rmesa->radeon.dri.fd, DRM_RADEON_CLEAR,
+ 			     &clear, sizeof(clear));
+ 
+ 
+       if ( ret ) {
+-	 UNLOCK_HARDWARE( rmesa );
++	 UNLOCK_HARDWARE( &rmesa->radeon );
+ 	 fprintf( stderr, "DRM_RADEON_CLEAR: return = %d\n", ret );
+ 	 exit( 1 );
+       }
+    }
+-
+-   UNLOCK_HARDWARE( rmesa );
+-   rmesa->hw.all_dirty = GL_TRUE;
++   UNLOCK_HARDWARE( &rmesa->radeon );
+ }
+-
+-
+-void r200WaitForIdleLocked( r200ContextPtr rmesa )
++/* ================================================================
++ * Buffer clear
++ */
++static void r200Clear( GLcontext *ctx, GLbitfield mask )
+ {
+-    int ret;
+-    int i = 0;
+-    
+-    do {
+-       ret = drmCommandNone( rmesa->dri.fd, DRM_RADEON_CP_IDLE);
+-       if (ret) 
+-	  DO_USLEEP( 1 );
+-    } while (ret && ++i < 100);
+-    
+-    if ( ret < 0 ) {
+-       UNLOCK_HARDWARE( rmesa );
+-       fprintf( stderr, "Error: R200 timed out... exiting\n" );
+-       exit( -1 );
+-    }
+-}
++   r200ContextPtr rmesa = R200_CONTEXT(ctx);
++   __DRIdrawablePrivate *dPriv = rmesa->radeon.dri.drawable;
++   GLuint flags = 0;
++   GLuint color_mask = 0;
++   GLint ret;
+ 
++   if ( R200_DEBUG & DEBUG_IOCTL ) {
++      fprintf( stderr, "r200Clear\n");
++   }
+ 
+-static void r200WaitForIdle( r200ContextPtr rmesa )
+-{
+-   LOCK_HARDWARE(rmesa);
+-   r200WaitForIdleLocked( rmesa );
+-   UNLOCK_HARDWARE(rmesa);
+-}
++   {
++      LOCK_HARDWARE( &rmesa->radeon );
++      UNLOCK_HARDWARE( &rmesa->radeon );
++      if ( dPriv->numClipRects == 0 ) 
++	 return;
++   }
+ 
++   radeonFlush( ctx );
+ 
+-void r200Flush( GLcontext *ctx )
+-{
+-   r200ContextPtr rmesa = R200_CONTEXT( ctx );
++   if ( mask & BUFFER_BIT_FRONT_LEFT ) {
++      flags |= RADEON_FRONT;
++      color_mask = rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK];
++      mask &= ~BUFFER_BIT_FRONT_LEFT;
++   }
+ 
+-   if (R200_DEBUG & DEBUG_IOCTL)
+-      fprintf(stderr, "%s\n", __FUNCTION__);
++   if ( mask & BUFFER_BIT_BACK_LEFT ) {
++      flags |= RADEON_BACK;
++      color_mask = rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK];
++      mask &= ~BUFFER_BIT_BACK_LEFT;
++   }
+ 
+-   if (rmesa->dma.flush)
+-      rmesa->dma.flush( rmesa );
++   if ( mask & BUFFER_BIT_DEPTH ) {
++      flags |= RADEON_DEPTH;
++      mask &= ~BUFFER_BIT_DEPTH;
++   }
+ 
+-   r200EmitState( rmesa );
+-   
+-   if (rmesa->store.cmd_used)
+-      r200FlushCmdBuf( rmesa, __FUNCTION__ );
+-}
++   if ( (mask & BUFFER_BIT_STENCIL) && rmesa->radeon.state.stencil.hwBuffer ) {
++      flags |= RADEON_STENCIL;
++      mask &= ~BUFFER_BIT_STENCIL;
++   }
+ 
+-/* Make sure all commands have been sent to the hardware and have
+- * completed processing.
+- */
+-void r200Finish( GLcontext *ctx )
+-{
+-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+-   r200Flush( ctx );
++   if ( mask ) {
++      if (R200_DEBUG & DEBUG_FALLBACKS)
++	 fprintf(stderr, "%s: swrast clear, mask: %x\n", __FUNCTION__, mask);
++      _swrast_Clear( ctx, mask );
++   }
++
++   if ( !flags ) 
++      return;
+ 
+-   if (rmesa->do_irqs) {
+-      LOCK_HARDWARE( rmesa );
+-      r200EmitIrqLocked( rmesa );
+-      UNLOCK_HARDWARE( rmesa );
+-      r200WaitIrq( rmesa );
++   if (rmesa->using_hyperz) {
++      flags |= RADEON_USE_COMP_ZBUF;
++/*      if (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_R200)
++	 flags |= RADEON_USE_HIERZ; */
++      if (!(rmesa->radeon.state.stencil.hwBuffer) ||
++	 ((flags & RADEON_DEPTH) && (flags & RADEON_STENCIL) &&
++	    ((rmesa->radeon.state.stencil.clear & R200_STENCIL_WRITE_MASK) == R200_STENCIL_WRITE_MASK))) {
++	  flags |= RADEON_CLEAR_FASTZ;
++      }
+    }
+-   else 
+-      r200WaitForIdle( rmesa );
+-}
+ 
++   if (rmesa->radeon.radeonScreen->kernel_mm)
++      r200UserClear(ctx, flags);
++   else
++      r200KernelClear(ctx, flags);
++
++   rmesa->radeon.hw.all_dirty = GL_TRUE;
++}
+ 
+ /* This version of AllocateMemoryMESA allocates only GART memory, and
+  * only does so after the point at which the driver has been
+@@ -875,7 +288,7 @@ void *r200AllocateMemoryMESA(__DRIscreen *screen, GLsizei size,
+       fprintf(stderr, "%s sz %d %f/%f/%f\n", __FUNCTION__, size, readfreq, 
+ 	      writefreq, priority);
+ 
+-   if (!ctx || !(rmesa = R200_CONTEXT(ctx)) || !rmesa->r200Screen->gartTextures.map)
++   if (!ctx || !(rmesa = R200_CONTEXT(ctx)) || !rmesa->radeon.radeonScreen->gartTextures.map)
+       return NULL;
+ 
+    if (getenv("R200_NO_ALLOC"))
+@@ -886,7 +299,7 @@ void *r200AllocateMemoryMESA(__DRIscreen *screen, GLsizei size,
+    alloc.size = size;
+    alloc.region_offset = &region_offset;
+ 
+-   ret = drmCommandWriteRead( rmesa->r200Screen->driScreen->fd,
++   ret = drmCommandWriteRead( rmesa->radeon.radeonScreen->driScreen->fd,
+ 			      DRM_RADEON_ALLOC,
+ 			      &alloc, sizeof(alloc));
+    
+@@ -896,7 +309,7 @@ void *r200AllocateMemoryMESA(__DRIscreen *screen, GLsizei size,
+    }
+    
+    {
+-      char *region_start = (char *)rmesa->r200Screen->gartTextures.map;
++      char *region_start = (char *)rmesa->radeon.radeonScreen->gartTextures.map;
+       return (void *)(region_start + region_offset);
+    }
+ }
+@@ -914,24 +327,24 @@ void r200FreeMemoryMESA(__DRIscreen *screen, GLvoid *pointer)
+    if (R200_DEBUG & DEBUG_IOCTL)
+       fprintf(stderr, "%s %p\n", __FUNCTION__, pointer);
+ 
+-   if (!ctx || !(rmesa = R200_CONTEXT(ctx)) || !rmesa->r200Screen->gartTextures.map) {
++   if (!ctx || !(rmesa = R200_CONTEXT(ctx)) || !rmesa->radeon.radeonScreen->gartTextures.map) {
+       fprintf(stderr, "%s: no context\n", __FUNCTION__);
+       return;
+    }
+ 
+-   region_offset = (char *)pointer - (char *)rmesa->r200Screen->gartTextures.map;
++   region_offset = (char *)pointer - (char *)rmesa->radeon.radeonScreen->gartTextures.map;
+ 
+    if (region_offset < 0 || 
+-       region_offset > rmesa->r200Screen->gartTextures.size) {
++       region_offset > rmesa->radeon.radeonScreen->gartTextures.size) {
+       fprintf(stderr, "offset %d outside range 0..%d\n", region_offset,
+-	      rmesa->r200Screen->gartTextures.size);
++	      rmesa->radeon.radeonScreen->gartTextures.size);
+       return;
+    }
+ 
+    memfree.region = RADEON_MEM_REGION_GART;
+    memfree.region_offset = region_offset;
+    
+-   ret = drmCommandWrite( rmesa->r200Screen->driScreen->fd,
++   ret = drmCommandWrite( rmesa->radeon.radeonScreen->driScreen->fd,
+ 			  DRM_RADEON_FREE,
+ 			  &memfree, sizeof(memfree));
+    
+@@ -956,16 +369,16 @@ GLuint r200GetMemoryOffsetMESA(__DRIscreen *screen, const GLvoid *pointer)
+ 
+    card_offset = r200GartOffsetFromVirtual( rmesa, pointer );
+ 
+-   return card_offset - rmesa->r200Screen->gart_base;
++   return card_offset - rmesa->radeon.radeonScreen->gart_base;
+ }
+ 
+ GLboolean r200IsGartMemory( r200ContextPtr rmesa, const GLvoid *pointer,
+ 			   GLint size )
+ {
+-   ptrdiff_t offset = (char *)pointer - (char *)rmesa->r200Screen->gartTextures.map;
++   ptrdiff_t offset = (char *)pointer - (char *)rmesa->radeon.radeonScreen->gartTextures.map;
+    int valid = (size >= 0 &&
+ 		offset >= 0 &&
+-		offset + size < rmesa->r200Screen->gartTextures.size);
++		offset + size < rmesa->radeon.radeonScreen->gartTextures.size);
+ 
+    if (R200_DEBUG & DEBUG_IOCTL)
+       fprintf(stderr, "r200IsGartMemory( %p ) : %d\n", pointer, valid );
+@@ -976,12 +389,12 @@ GLboolean r200IsGartMemory( r200ContextPtr rmesa, const GLvoid *pointer,
+ 
+ GLuint r200GartOffsetFromVirtual( r200ContextPtr rmesa, const GLvoid *pointer )
+ {
+-   ptrdiff_t offset = (char *)pointer - (char *)rmesa->r200Screen->gartTextures.map;
++   ptrdiff_t offset = (char *)pointer - (char *)rmesa->radeon.radeonScreen->gartTextures.map;
+ 
+-   if (offset < 0 || offset > rmesa->r200Screen->gartTextures.size)
++   if (offset < 0 || offset > rmesa->radeon.radeonScreen->gartTextures.size)
+       return ~0;
+    else
+-      return rmesa->r200Screen->gart_texture_offset + offset;
++      return rmesa->radeon.radeonScreen->gart_texture_offset + offset;
+ }
+ 
+ 
+@@ -989,7 +402,7 @@ GLuint r200GartOffsetFromVirtual( r200ContextPtr rmesa, const GLvoid *pointer )
+ void r200InitIoctlFuncs( struct dd_function_table *functions )
+ {
+     functions->Clear = r200Clear;
+-    functions->Finish = r200Finish;
+-    functions->Flush = r200Flush;
++    functions->Finish = radeonFinish;
++    functions->Flush = radeonFlush;
+ }
+ 
+diff --git a/src/mesa/drivers/dri/r200/r200_ioctl.h b/src/mesa/drivers/dri/r200/r200_ioctl.h
+index f7458e4..2a4b8a1 100644
+--- a/src/mesa/drivers/dri/r200/r200_ioctl.h
++++ b/src/mesa/drivers/dri/r200/r200_ioctl.h
+@@ -37,65 +37,30 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ #include "main/simple_list.h"
+ #include "radeon_dri.h"
+-#include "r200_lock.h"
++
++#include "radeon_bocs_wrapper.h"
+ 
+ #include "xf86drm.h"
+ #include "drm.h"
+ #include "radeon_drm.h"
+ 
+-extern void r200EmitState( r200ContextPtr rmesa );
+ extern void r200EmitVertexAOS( r200ContextPtr rmesa,
+-				 GLuint vertex_size,
+-				 GLuint offset );
++			       GLuint vertex_size,
++			       struct radeon_bo *bo,
++			       GLuint offset );
+ 
+ extern void r200EmitVbufPrim( r200ContextPtr rmesa,
+ 				GLuint primitive,
+ 				GLuint vertex_nr );
+ 
+-extern void r200FlushElts( r200ContextPtr rmesa );
++extern void r200FlushElts(GLcontext *ctx);
+ 
+ extern GLushort *r200AllocEltsOpenEnded( r200ContextPtr rmesa,
+ 					   GLuint primitive,
+ 					   GLuint min_nr );
+ 
+-extern void r200EmitAOS( r200ContextPtr rmesa,
+-			   struct r200_dma_region **regions,
+-			   GLuint n,
+-			   GLuint offset );
+-
+-extern void r200EmitBlit( r200ContextPtr rmesa,
+-			  GLuint color_fmt,
+-			  GLuint src_pitch,
+-			  GLuint src_offset,
+-			  GLuint dst_pitch,
+-			  GLuint dst_offset,
+-			  GLint srcx, GLint srcy,
+-			  GLint dstx, GLint dsty,
+-			  GLuint w, GLuint h );
+-
+-extern void r200EmitWait( r200ContextPtr rmesa, GLuint flags );
+-
+-extern void r200FlushCmdBuf( r200ContextPtr rmesa, const char * );
+-extern int r200FlushCmdBufLocked( r200ContextPtr rmesa, const char * caller );
+-
+-extern void r200RefillCurrentDmaRegion( r200ContextPtr rmesa );
+-
+-extern void r200AllocDmaRegion( r200ContextPtr rmesa,
+-				  struct r200_dma_region *region,
+-				  int bytes, 
+-				  int alignment );
+-
+-extern void r200ReleaseDmaRegion( r200ContextPtr rmesa,
+-				    struct r200_dma_region *region,
+-				    const char *caller );
+-
+-extern void r200CopyBuffer( __DRIdrawablePrivate *drawable,
+-			    const drm_clip_rect_t      *rect);
+-extern void r200PageFlip( __DRIdrawablePrivate *drawable );
+-extern void r200Flush( GLcontext *ctx );
+-extern void r200Finish( GLcontext *ctx );
+-extern void r200WaitForIdleLocked( r200ContextPtr rmesa );
+-extern void r200WaitForVBlank( r200ContextPtr rmesa );
++extern void r200EmitAOS(r200ContextPtr rmesa, GLuint nr, GLuint offset);
++
+ extern void r200InitIoctlFuncs( struct dd_function_table *functions );
+ 
+ extern void *r200AllocateMemoryMESA( __DRIscreen *screen, GLsizei size, GLfloat readfreq,
+@@ -119,8 +84,8 @@ void r200SetUpAtomList( r200ContextPtr rmesa );
+  */
+ #define R200_NEWPRIM( rmesa )			\
+ do {						\
+-   if ( rmesa->dma.flush )			\
+-      rmesa->dma.flush( rmesa );	\
++   if ( rmesa->radeon.dma.flush )			\
++      rmesa->radeon.dma.flush( rmesa->radeon.glCtx );	\
+ } while (0)
+ 
+ /* Can accomodate several state changes and primitive changes without
+@@ -130,7 +95,7 @@ do {						\
+ do {								\
+    R200_NEWPRIM( rmesa );					\
+    rmesa->hw.ATOM.dirty = GL_TRUE;				\
+-   rmesa->hw.is_dirty = GL_TRUE;				\
++   rmesa->radeon.hw.is_dirty = GL_TRUE;				\
+ } while (0)
+ 
+ #define R200_DB_STATE( ATOM )			        \
+@@ -139,13 +104,13 @@ do {								\
+ 
+ static INLINE int R200_DB_STATECHANGE( 
+    r200ContextPtr rmesa,
+-   struct r200_state_atom *atom )
++   struct radeon_state_atom *atom )
+ {
+    if (memcmp(atom->cmd, atom->lastcmd, atom->cmd_size*4)) {
+-      int *tmp;
++      GLuint *tmp;
+       R200_NEWPRIM( rmesa );
+       atom->dirty = GL_TRUE;
+-      rmesa->hw.is_dirty = GL_TRUE;
++      rmesa->radeon.hw.is_dirty = GL_TRUE;
+       tmp = atom->cmd; 
+       atom->cmd = atom->lastcmd;
+       atom->lastcmd = tmp;
+@@ -156,15 +121,6 @@ static INLINE int R200_DB_STATECHANGE(
+ }
+ 
+ 
+-/* Fire the buffered vertices no matter what.
+- */
+-#define R200_FIREVERTICES( rmesa )			\
+-do {							\
+-   if ( rmesa->store.cmd_used || rmesa->dma.flush ) {	\
+-      r200Flush( rmesa->glCtx );			\
+-   }							\
+-} while (0)
+-
+ /* Command lengths.  Note that any time you ensure ELTS_BUFSZ or VBUF_BUFSZ
+  * are available, you will also be adding an rmesa->state.max_state_size because
+  * r200EmitState is called from within r200EmitVbufPrim and r200FlushElts.
+@@ -174,36 +130,36 @@ do {							\
+ #define ELTS_BUFSZ(nr)	(12 + nr * 2)
+ #define VBUF_BUFSZ	(3 * sizeof(int))
+ 
+-/* Ensure that a minimum amount of space is available in the command buffer.
+- * This is used to ensure atomicity of state updates with the rendering requests
+- * that rely on them.
+- *
+- * An alternative would be to implement a "soft lock" such that when the buffer
+- * wraps at an inopportune time, we grab the lock, flush the current buffer,
+- * and hang on to the lock until the critical section is finished and we flush
+- * the buffer again and unlock.
+- */
+-static INLINE void r200EnsureCmdBufSpace( r200ContextPtr rmesa, int bytes )
++static inline uint32_t cmdpacket3(int cmd_type)
+ {
+-   if (rmesa->store.cmd_used + bytes > R200_CMD_BUF_SZ)
+-      r200FlushCmdBuf( rmesa, __FUNCTION__ );
+-   assert( bytes <= R200_CMD_BUF_SZ );
+-}
++  drm_radeon_cmd_header_t cmd;
+ 
+-/* Alloc space in the command buffer
+- */
+-static INLINE char *r200AllocCmdBuf( r200ContextPtr rmesa,
+-					 int bytes, const char *where )
+-{
+-   char * head;
++  cmd.i = 0;
++  cmd.header.cmd_type = cmd_type;
+ 
+-   if (rmesa->store.cmd_used + bytes > R200_CMD_BUF_SZ)
+-      r200FlushCmdBuf( rmesa, where );
++  return (uint32_t)cmd.i;
+ 
+-   head = rmesa->store.cmd_buf + rmesa->store.cmd_used;
+-   rmesa->store.cmd_used += bytes;
+-   assert( rmesa->store.cmd_used <= R200_CMD_BUF_SZ );
+-   return head;
+ }
+ 
++#define OUT_BATCH_PACKET3(packet, num_extra) do {	      \
++    if (!b_l_rmesa->radeonScreen->kernel_mm) {		      \
++      OUT_BATCH(cmdpacket3(RADEON_CMD_PACKET3));				      \
++      OUT_BATCH(CP_PACKET3((packet), (num_extra)));	      \
++    } else {						      \
++      OUT_BATCH(CP_PACKET2);				      \
++      OUT_BATCH(CP_PACKET3((packet), (num_extra)));	      \
++    }							      \
++  } while(0)
++
++#define OUT_BATCH_PACKET3_CLIP(packet, num_extra) do {	      \
++    if (!b_l_rmesa->radeonScreen->kernel_mm) {		      \
++      OUT_BATCH(cmdpacket3(RADEON_CMD_PACKET3_CLIP));	      \
++      OUT_BATCH(CP_PACKET3((packet), (num_extra)));	      \
++    } else {						      \
++      OUT_BATCH(CP_PACKET2);				      \
++      OUT_BATCH(CP_PACKET3((packet), (num_extra)));	      \
++    }							      \
++  } while(0)
++
++
+ #endif /* __R200_IOCTL_H__ */
+diff --git a/src/mesa/drivers/dri/r200/r200_lock.c b/src/mesa/drivers/dri/r200/r200_lock.c
+deleted file mode 100644
+index 99661a4..0000000
+--- a/src/mesa/drivers/dri/r200/r200_lock.c
++++ /dev/null
+@@ -1,116 +0,0 @@
+-/*
+-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+-
+-The Weather Channel (TM) funded Tungsten Graphics to develop the
+-initial release of the Radeon 8500 driver under the XFree86 license.
+-This notice must be preserved.
+-
+-Permission is hereby granted, free of charge, to any person obtaining
+-a copy of this software and associated documentation files (the
+-"Software"), to deal in the Software without restriction, including
+-without limitation the rights to use, copy, modify, merge, publish,
+-distribute, sublicense, and/or sell copies of the Software, and to
+-permit persons to whom the Software is furnished to do so, subject to
+-the following conditions:
+-
+-The above copyright notice and this permission notice (including the
+-next paragraph) shall be included in all copies or substantial
+-portions of the Software.
+-
+-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+-
+-**************************************************************************/
+-
+-/*
+- * Authors:
+- *   Keith Whitwell <keith@tungstengraphics.com>
+- */
+- 
+-#include "r200_context.h"
+-#include "r200_lock.h"
+-#include "r200_tex.h"
+-#include "r200_state.h"
+-#include "r200_ioctl.h"
+-
+-#include "drirenderbuffer.h"
+-
+-
+-#if DEBUG_LOCKING
+-char *prevLockFile = NULL;
+-int prevLockLine = 0;
+-#endif
+-
+-/* Turn on/off page flipping according to the flags in the sarea:
+- */
+-static void
+-r200UpdatePageFlipping( r200ContextPtr rmesa )
+-{
+-   rmesa->doPageFlip = rmesa->sarea->pfState;
+-   if (rmesa->glCtx->WinSysDrawBuffer) {
+-      driFlipRenderbuffers(rmesa->glCtx->WinSysDrawBuffer,
+-                           rmesa->sarea->pfCurrentPage);
+-   }
+-}
+-
+-
+-
+-/* Update the hardware state.  This is called if another main/context.has
+- * grabbed the hardware lock, which includes the X server.  This
+- * function also updates the driver's window state after the X server
+- * moves, resizes or restacks a window -- the change will be reflected
+- * in the drawable position and clip rects.  Since the X server grabs
+- * the hardware lock when it changes the window state, this routine will
+- * automatically be called after such a change.
+- */
+-void r200GetLock( r200ContextPtr rmesa, GLuint flags )
+-{
+-   __DRIdrawablePrivate *drawable = rmesa->dri.drawable;
+-   __DRIdrawablePrivate *readable = rmesa->dri.readable;
+-   __DRIscreenPrivate *sPriv = rmesa->dri.screen;
+-   drm_radeon_sarea_t *sarea = rmesa->sarea;
+-   int i;
+-
+-   drmGetLock( rmesa->dri.fd, rmesa->dri.hwContext, flags );
+-
+-   /* The window might have moved, so we might need to get new clip
+-    * rects.
+-    *
+-    * NOTE: This releases and regrabs the hw lock to allow the X server
+-    * to respond to the DRI protocol request for new drawable info.
+-    * Since the hardware state depends on having the latest drawable
+-    * clip rects, all state checking must be done _after_ this call.
+-    */
+-   DRI_VALIDATE_DRAWABLE_INFO( sPriv, drawable );
+-   if (drawable != readable) {
+-      DRI_VALIDATE_DRAWABLE_INFO( sPriv, readable );
+-   }
+-
+-   if ( rmesa->lastStamp != drawable->lastStamp ) {
+-      r200UpdatePageFlipping( rmesa );
+-      r200SetCliprects( rmesa );
+-      r200UpdateViewportOffset( rmesa->glCtx );
+-      driUpdateFramebufferSize(rmesa->glCtx, drawable);
+-   }
+-
+-   R200_STATECHANGE( rmesa, ctx );
+-   if (rmesa->sarea->tiling_enabled) {
+-      rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |= R200_COLOR_TILE_ENABLE;
+-   }
+-   else rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] &= ~R200_COLOR_TILE_ENABLE;
+-
+-   if ( sarea->ctx_owner != rmesa->dri.hwContext ) {
+-      sarea->ctx_owner = rmesa->dri.hwContext;
+-   }
+-
+-   for ( i = 0 ; i < rmesa->nr_heaps ; i++ ) {
+-      DRI_AGE_TEXTURES( rmesa->texture_heaps[ i ] );
+-   }
+-
+-   rmesa->lost_context = GL_TRUE;
+-}
+diff --git a/src/mesa/drivers/dri/r200/r200_lock.h b/src/mesa/drivers/dri/r200/r200_lock.h
+deleted file mode 100644
+index 4ff9890..0000000
+--- a/src/mesa/drivers/dri/r200/r200_lock.h
++++ /dev/null
+@@ -1,106 +0,0 @@
+-/*
+-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+-
+-The Weather Channel (TM) funded Tungsten Graphics to develop the
+-initial release of the Radeon 8500 driver under the XFree86 license.
+-This notice must be preserved.
+-
+-Permission is hereby granted, free of charge, to any person obtaining
+-a copy of this software and associated documentation files (the
+-"Software"), to deal in the Software without restriction, including
+-without limitation the rights to use, copy, modify, merge, publish,
+-distribute, sublicense, and/or sell copies of the Software, and to
+-permit persons to whom the Software is furnished to do so, subject to
+-the following conditions:
+-
+-The above copyright notice and this permission notice (including the
+-next paragraph) shall be included in all copies or substantial
+-portions of the Software.
+-
+-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+-
+-**************************************************************************/
+-
+-/*
+- * Authors:
+- *   Keith Whitwell <keith@tungstengraphics.com>
+- */
+-
+-#ifndef __R200_LOCK_H__
+-#define __R200_LOCK_H__
+-
+-extern void r200GetLock( r200ContextPtr rmesa, GLuint flags );
+-
+-/* Turn DEBUG_LOCKING on to find locking conflicts.
+- */
+-#define DEBUG_LOCKING	0
+-
+-#if DEBUG_LOCKING
+-extern char *prevLockFile;
+-extern int prevLockLine;
+-
+-#define DEBUG_LOCK()							\
+-   do {									\
+-      prevLockFile = (__FILE__);					\
+-      prevLockLine = (__LINE__);					\
+-   } while (0)
+-
+-#define DEBUG_RESET()							\
+-   do {									\
+-      prevLockFile = 0;							\
+-      prevLockLine = 0;							\
+-   } while (0)
+-
+-#define DEBUG_CHECK_LOCK()						\
+-   do {									\
+-      if ( prevLockFile ) {						\
+-	 fprintf( stderr,						\
+-		  "LOCK SET!\n\tPrevious %s:%d\n\tCurrent: %s:%d\n",	\
+-		  prevLockFile, prevLockLine, __FILE__, __LINE__ );	\
+-	 exit( 1 );							\
+-      }									\
+-   } while (0)
+-
+-#else
+-
+-#define DEBUG_LOCK()
+-#define DEBUG_RESET()
+-#define DEBUG_CHECK_LOCK()
+-
+-#endif
+-
+-/*
+- * !!! We may want to separate locks from locks with validation.  This
+- * could be used to improve performance for those things commands that
+- * do not do any drawing !!!
+- */
+-
+-
+-/* Lock the hardware and validate our state.
+- */
+-#define LOCK_HARDWARE( rmesa )					\
+-   do {								\
+-      char __ret = 0;						\
+-      DEBUG_CHECK_LOCK();					\
+-      DRM_CAS( rmesa->dri.hwLock, rmesa->dri.hwContext,		\
+-	       (DRM_LOCK_HELD | rmesa->dri.hwContext), __ret );	\
+-      if ( __ret )						\
+-	 r200GetLock( rmesa, 0 );				\
+-      DEBUG_LOCK();						\
+-   } while (0)
+-
+-#define UNLOCK_HARDWARE( rmesa )					\
+-   do {									\
+-      DRM_UNLOCK( rmesa->dri.fd,					\
+-		  rmesa->dri.hwLock,					\
+-		  rmesa->dri.hwContext );				\
+-      DEBUG_RESET();							\
+-   } while (0)
+-
+-#endif /* __R200_LOCK_H__ */
+diff --git a/src/mesa/drivers/dri/r200/r200_maos_arrays.c b/src/mesa/drivers/dri/r200/r200_maos_arrays.c
+index 8512b9a..5dbc202 100644
+--- a/src/mesa/drivers/dri/r200/r200_maos_arrays.c
++++ b/src/mesa/drivers/dri/r200/r200_maos_arrays.c
+@@ -50,110 +50,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "r200_maos.h"
+ #include "r200_tcl.h"
+ 
+-
+-#if 0
+-/* Usage:
+- *   - from r200_tcl_render
+- *   - call r200EmitArrays to ensure uptodate arrays in dma
+- *   - emit primitives (new type?) which reference the data
+- *       -- need to use elts for lineloop, quads, quadstrip/flat
+- *       -- other primitives are all well-formed (need tristrip-1,fake-poly)
+- *
+- */
+-static void emit_ubyte_rgba3( GLcontext *ctx,
+-		       struct r200_dma_region *rvb,
+-		       char *data,
+-		       int stride,
+-		       int count )
+-{
+-   int i;
+-   r200_color_t *out = (r200_color_t *)(rvb->start + rvb->address);
+-
+-   if (R200_DEBUG & DEBUG_VERTS)
+-      fprintf(stderr, "%s count %d stride %d out %p\n",
+-	      __FUNCTION__, count, stride, (void *)out);
+-
+-   for (i = 0; i < count; i++) {
+-      out->red   = *data;
+-      out->green = *(data+1);
+-      out->blue  = *(data+2);
+-      out->alpha = 0xFF;
+-      out++;
+-      data += stride;
+-   }
+-}
+-
+-static void emit_ubyte_rgba4( GLcontext *ctx,
+-			      struct r200_dma_region *rvb,
+-			      char *data,
+-			      int stride,
+-			      int count )
+-{
+-   int i;
+-   int *out = (int *)(rvb->address + rvb->start);
+-
+-   if (R200_DEBUG & DEBUG_VERTS)
+-      fprintf(stderr, "%s count %d stride %d\n",
+-	      __FUNCTION__, count, stride);
+-
+-   if (stride == 4) {
+-      for (i = 0; i < count; i++)
+-	 ((int *)out)[i] = LE32_TO_CPU(((int *)data)[i]);
+-   } else {
+-      for (i = 0; i < count; i++) {
+-	 *(int *)out++ = LE32_TO_CPU(*(int *)data);
+-	 data += stride;
+-      }
+-   }
+-}
+-
+-
+-static void emit_ubyte_rgba( GLcontext *ctx,
+-			     struct r200_dma_region *rvb,
+-			     char *data,
+-			     int size,
+-			     int stride,
+-			     int count )
+-{
+-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+-
+-   if (R200_DEBUG & DEBUG_VERTS)
+-      fprintf(stderr, "%s %d/%d\n", __FUNCTION__, count, size);
+-
+-   assert (!rvb->buf);
+-
+-   if (stride == 0) {
+-      r200AllocDmaRegion( rmesa, rvb, 4, 4 );
+-      count = 1;
+-      rvb->aos_start = GET_START(rvb);
+-      rvb->aos_stride = 0;
+-      rvb->aos_size = 1;
+-   }
+-   else {
+-      r200AllocDmaRegion( rmesa, rvb, 4 * count, 4 );	/* alignment? */
+-      rvb->aos_start = GET_START(rvb);
+-      rvb->aos_stride = 1;
+-      rvb->aos_size = 1;
+-   }
+-
+-   /* Emit the data
+-    */
+-   switch (size) {
+-   case 3:
+-      emit_ubyte_rgba3( ctx, rvb, data, stride, count );
+-      break;
+-   case 4:
+-      emit_ubyte_rgba4( ctx, rvb, data, stride, count );
+-      break;
+-   default:
+-      assert(0);
+-      exit(1);
+-      break;
+-   }
+-}
+-#endif
+-
+-
+ #if defined(USE_X86_ASM)
+ #define COPY_DWORDS( dst, src, nr )					\
+ do {									\
+@@ -174,204 +70,34 @@ do {						\
+ } while (0)
+ #endif
+ 
+-
+-static void emit_vecfog( GLcontext *ctx,
+-			 struct r200_dma_region *rvb,
+-			 char *data,
+-			 int stride,
+-			 int count )
++static void r200_emit_vecfog(GLcontext *ctx, struct radeon_aos *aos,
++			     GLvoid *data, int stride, int count)
+ {
+-   int i;
+-   GLfloat *out;
+-
+-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+-   
+-   if (R200_DEBUG & DEBUG_VERTS)
+-      fprintf(stderr, "%s count %d stride %d\n",
+-	      __FUNCTION__, count, stride);
+-
+-   assert (!rvb->buf);
+-
+-   if (stride == 0) {
+-      r200AllocDmaRegion( rmesa, rvb, 4, 4 );
+-      count = 1;
+-      rvb->aos_start = GET_START(rvb);
+-      rvb->aos_stride = 0;
+-      rvb->aos_size = 1;
+-   }
+-   else {
+-      r200AllocDmaRegion( rmesa, rvb, count * 4, 4 );	/* alignment? */
+-      rvb->aos_start = GET_START(rvb);
+-      rvb->aos_stride = 1;
+-      rvb->aos_size = 1;
+-   }
+-
+-   /* Emit the data
+-    */
+-   out = (GLfloat *)(rvb->address + rvb->start);
+-   for (i = 0; i < count; i++) {
+-      out[0] = r200ComputeFogBlendFactor( ctx, *(GLfloat *)data );
+-      out++;
+-      data += stride;
+-   }
+-
++	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++	GLfloat *out;	/* blend factors are floats; an integer store would truncate them */
++	int i;
++	int size = 1;	/* dwords per element: fog is a single scalar */
++	/* stride 0: upload one constant value; otherwise one value per vertex. */
++	if (stride == 0) {
++		radeonAllocDmaRegion(rmesa, &aos->bo, &aos->offset, size * 4, 32);
++		count = 1;
++		aos->stride = 0;
++	} else {
++		radeonAllocDmaRegion(rmesa, &aos->bo, &aos->offset, size * count * 4, 32);
++		aos->stride = size;
++	}
++
++	aos->components = size;
++	aos->count = count;
++	/* Convert each fog coordinate to its blend factor while copying it out. */
++	out = (GLfloat *)((char *)aos->bo->ptr + aos->offset);
++	for (i = 0; i < count; i++) {
++	  out[0] = r200ComputeFogBlendFactor( ctx, *(GLfloat *)data );
++	  out++;
++	  data = (char *)data + stride;	/* no arithmetic on void pointers */
++	}
+ }
+ 
+-
+-static void emit_vec4( GLcontext *ctx,
+-		       struct r200_dma_region *rvb,
+-		       char *data,
+-		       int stride,
+-		       int count )
+-{
+-   int i;
+-   int *out = (int *)(rvb->address + rvb->start);
+-
+-   if (R200_DEBUG & DEBUG_VERTS)
+-      fprintf(stderr, "%s count %d stride %d\n",
+-	      __FUNCTION__, count, stride);
+-
+-   if (stride == 4)
+-      COPY_DWORDS( out, data, count );
+-   else
+-      for (i = 0; i < count; i++) {
+-	 out[0] = *(int *)data;
+-	 out++;
+-	 data += stride;
+-      }
+-}
+-
+-
+-static void emit_vec8( GLcontext *ctx,
+-		       struct r200_dma_region *rvb,
+-		       char *data,
+-		       int stride,
+-		       int count )
+-{
+-   int i;
+-   int *out = (int *)(rvb->address + rvb->start);
+-
+-   if (R200_DEBUG & DEBUG_VERTS)
+-      fprintf(stderr, "%s count %d stride %d\n",
+-	      __FUNCTION__, count, stride);
+-
+-   if (stride == 8)
+-      COPY_DWORDS( out, data, count*2 );
+-   else
+-      for (i = 0; i < count; i++) {
+-	 out[0] = *(int *)data;
+-	 out[1] = *(int *)(data+4);
+-	 out += 2;
+-	 data += stride;
+-      }
+-}
+-
+-static void emit_vec12( GLcontext *ctx,
+-		       struct r200_dma_region *rvb,
+-		       char *data,
+-		       int stride,
+-		       int count )
+-{
+-   int i;
+-   int *out = (int *)(rvb->address + rvb->start);
+-
+-   if (R200_DEBUG & DEBUG_VERTS)
+-      fprintf(stderr, "%s count %d stride %d out %p data %p\n",
+-	      __FUNCTION__, count, stride, (void *)out, (void *)data);
+-
+-   if (stride == 12)
+-      COPY_DWORDS( out, data, count*3 );
+-   else
+-      for (i = 0; i < count; i++) {
+-	 out[0] = *(int *)data;
+-	 out[1] = *(int *)(data+4);
+-	 out[2] = *(int *)(data+8);
+-	 out += 3;
+-	 data += stride;
+-      }
+-}
+-
+-static void emit_vec16( GLcontext *ctx,
+-			struct r200_dma_region *rvb,
+-			char *data,
+-			int stride,
+-			int count )
+-{
+-   int i;
+-   int *out = (int *)(rvb->address + rvb->start);
+-
+-   if (R200_DEBUG & DEBUG_VERTS)
+-      fprintf(stderr, "%s count %d stride %d\n",
+-	      __FUNCTION__, count, stride);
+-
+-   if (stride == 16)
+-      COPY_DWORDS( out, data, count*4 );
+-   else
+-      for (i = 0; i < count; i++) {
+-	 out[0] = *(int *)data;
+-	 out[1] = *(int *)(data+4);
+-	 out[2] = *(int *)(data+8);
+-	 out[3] = *(int *)(data+12);
+-	 out += 4;
+-	 data += stride;
+-      }
+-}
+-
+-
+-static void emit_vector( GLcontext *ctx,
+-			 struct r200_dma_region *rvb,
+-			 char *data,
+-			 int size,
+-			 int stride,
+-			 int count )
+-{
+-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+-
+-   if (R200_DEBUG & DEBUG_VERTS)
+-      fprintf(stderr, "%s count %d size %d stride %d\n",
+-	      __FUNCTION__, count, size, stride);
+-
+-   assert (!rvb->buf);
+-
+-   if (stride == 0) {
+-      r200AllocDmaRegion( rmesa, rvb, size * 4, 4 );
+-      count = 1;
+-      rvb->aos_start = GET_START(rvb);
+-      rvb->aos_stride = 0;
+-      rvb->aos_size = size;
+-   }
+-   else {
+-      r200AllocDmaRegion( rmesa, rvb, size * count * 4, 4 );	/* alignment? */
+-      rvb->aos_start = GET_START(rvb);
+-      rvb->aos_stride = size;
+-      rvb->aos_size = size;
+-   }
+-
+-   /* Emit the data
+-    */
+-   switch (size) {
+-   case 1:
+-      emit_vec4( ctx, rvb, data, stride, count );
+-      break;
+-   case 2:
+-      emit_vec8( ctx, rvb, data, stride, count );
+-      break;
+-   case 3:
+-      emit_vec12( ctx, rvb, data, stride, count );
+-      break;
+-   case 4:
+-      emit_vec16( ctx, rvb, data, stride, count );
+-      break;
+-   default:
+-      assert(0);
+-      exit(1);
+-      break;
+-   }
+-
+-}
+-
+-
+-
+ /* Emit any changed arrays to new GART memory, re-emit a packet to
+  * update the arrays.  
+  */
+@@ -379,12 +105,12 @@ void r200EmitArrays( GLcontext *ctx, GLubyte *vimap_rev )
+ {
+    r200ContextPtr rmesa = R200_CONTEXT( ctx );
+    struct vertex_buffer *VB = &TNL_CONTEXT( ctx )->vb;
+-   struct r200_dma_region **component = rmesa->tcl.aos_components;
+    GLuint nr = 0;
+    GLuint vfmt0 = 0, vfmt1 = 0;
+    GLuint count = VB->Count;
+    GLuint i, emitsize;
+ 
++   //   fprintf(stderr,"emit arrays\n");
+    for ( i = 0; i < 15; i++ ) {
+       GLubyte attrib = vimap_rev[i];
+       if (attrib != 255) {
+@@ -416,20 +142,20 @@ void r200EmitArrays( GLcontext *ctx, GLubyte *vimap_rev )
+ 	 case 3:
+ 	    /* special handling to fix up fog. Will get us into trouble with vbos...*/
+ 	    assert(attrib == VERT_ATTRIB_FOG);
+-	    if (!rmesa->tcl.vertex_data[i].buf) {
++	    if (!rmesa->tcl.aos[i].bo) {
+ 	       if (ctx->VertexProgram._Enabled)
+-		  emit_vector( ctx,
+-			 &(rmesa->tcl.vertex_data[i]),
+-			 (char *)VB->AttribPtr[attrib]->data,
+-			 1,
+-			 VB->AttribPtr[attrib]->stride,
+-			 count);
++		  rcommon_emit_vector( ctx,
++				       &(rmesa->tcl.aos[nr]),
++				       (char *)VB->AttribPtr[attrib]->data,
++				       1,
++				       VB->AttribPtr[attrib]->stride,
++				       count);
+ 	       else
+-		  emit_vecfog( ctx,
+-			 &(rmesa->tcl.vertex_data[i]),
+-			 (char *)VB->AttribPtr[attrib]->data,
+-			 VB->AttribPtr[attrib]->stride,
+-			 count);
++		 r200_emit_vecfog( ctx,
++				   &(rmesa->tcl.aos[nr]),
++				   (char *)VB->AttribPtr[attrib]->data,
++				   VB->AttribPtr[attrib]->stride,
++				   count);
+ 	    }
+ 	    vfmt0 |= R200_VTX_DISCRETE_FOG;
+ 	    goto after_emit;
+@@ -473,17 +199,17 @@ void r200EmitArrays( GLcontext *ctx, GLubyte *vimap_rev )
+ 	 default:
+ 	    assert(0);
+ 	 }
+-	 if (!rmesa->tcl.vertex_data[i].buf) {
+-	    emit_vector( ctx,
+-			 &(rmesa->tcl.vertex_data[i]),
+-			 (char *)VB->AttribPtr[attrib]->data,
+-			 emitsize,
+-			 VB->AttribPtr[attrib]->stride,
+-			 count );
++	 if (!rmesa->tcl.aos[nr].bo) {
++	   rcommon_emit_vector( ctx,
++				&(rmesa->tcl.aos[nr]),
++				(char *)VB->AttribPtr[attrib]->data,
++				emitsize,
++				VB->AttribPtr[attrib]->stride,
++				count );
+ 	 }
+ after_emit:
+ 	 assert(nr < 12);
+-	 component[nr++] = &rmesa->tcl.vertex_data[i];
++	 nr++;
+       }
+    }
+ 
+@@ -501,12 +227,11 @@ after_emit:
+ void r200ReleaseArrays( GLcontext *ctx, GLuint newinputs )
+ {
+    r200ContextPtr rmesa = R200_CONTEXT( ctx );
+-
+-   /* only do it for changed inputs ? */
+    int i;
+-   for (i = 0; i < 15; i++) {
+-      if (newinputs & (1 << i))
+-	 r200ReleaseDmaRegion( rmesa,
+-	    &rmesa->tcl.vertex_data[i], __FUNCTION__ );
++   for (i = 0; i < rmesa->tcl.nr_aos_components; i++) {
++     if (rmesa->tcl.aos[i].bo) {
++       radeon_bo_unref(rmesa->tcl.aos[i].bo);
++       rmesa->tcl.aos[i].bo = NULL;
++     }
+    }
+ }
+diff --git a/src/mesa/drivers/dri/r200/r200_pixel.c b/src/mesa/drivers/dri/r200/r200_pixel.c
+index be68821..a6c6558 100644
+--- a/src/mesa/drivers/dri/r200/r200_pixel.c
++++ b/src/mesa/drivers/dri/r200/r200_pixel.c
+@@ -51,7 +51,7 @@ check_color( const GLcontext *ctx, GLenum type, GLenum format,
+ 	     const void *pixels, GLint sz, GLint pitch )
+ {
+    r200ContextPtr rmesa = R200_CONTEXT(ctx);
+-   GLuint cpp = rmesa->r200Screen->cpp;
++   GLuint cpp = rmesa->radeon.radeonScreen->cpp;
+ 
+    if (R200_DEBUG & DEBUG_PIXEL)
+       fprintf(stderr, "%s\n", __FUNCTION__);
+@@ -137,8 +137,8 @@ clip_pixelrect( const GLcontext *ctx,
+    if (*height <= 0)
+       return GL_FALSE;
+ 
+-   *size = ((*y + *height - 1) * rmesa->r200Screen->frontPitch +
+-	    (*x + *width - 1) * rmesa->r200Screen->cpp);
++   *size = ((*y + *height - 1) * rmesa->radeon.radeonScreen->frontPitch +
++	    (*x + *width - 1) * rmesa->radeon.radeonScreen->cpp);
+ 
+    return GL_TRUE;
+ }
+@@ -153,19 +153,20 @@ r200TryReadPixels( GLcontext *ctx,
+    r200ContextPtr rmesa = R200_CONTEXT(ctx);
+    GLint pitch = pack->RowLength ? pack->RowLength : width;
+    GLint blit_format;
+-   GLuint cpp = rmesa->r200Screen->cpp;
++   GLuint cpp = rmesa->radeon.radeonScreen->cpp;
+    GLint size = width * height * cpp;
+ 
++   return GL_FALSE;
++#if 0
+    if (R200_DEBUG & DEBUG_PIXEL)
+       fprintf(stderr, "%s\n", __FUNCTION__);
+ 
+    /* Only accelerate reading to GART buffers.
+     */
+    if ( !r200IsGartMemory(rmesa, pixels, 
+-			 pitch * height * rmesa->r200Screen->cpp ) ) {
++			 pitch * height * rmesa->radeon.radeonScreen->cpp ) ) {
+       if (R200_DEBUG & DEBUG_PIXEL)
+ 	 fprintf(stderr, "%s: dest not GART\n", __FUNCTION__);
+-      return GL_FALSE;
+    }
+ 
+    /* Need GL_PACK_INVERT_MESA to cope with upsidedown results from
+@@ -180,7 +181,7 @@ r200TryReadPixels( GLcontext *ctx,
+    if (!check_color(ctx, type, format, pack, pixels, size, pitch))
+       return GL_FALSE;
+ 
+-   switch ( rmesa->r200Screen->cpp ) {
++   switch ( rmesa->radeon.radeonScreen->cpp ) {
+    case 4:
+       blit_format = R200_CP_COLOR_FORMAT_ARGB8888;
+       break;
+@@ -197,14 +198,14 @@ r200TryReadPixels( GLcontext *ctx,
+     * a full command buffer expects to be called unlocked.  As a
+     * workaround, immediately flush the buffer on aquiring the lock.
+     */
+-   LOCK_HARDWARE( rmesa );
++   LOCK_HARDWARE( &rmesa->radeon );
+ 
+    if (rmesa->store.cmd_used)
+-      r200FlushCmdBufLocked( rmesa, __FUNCTION__ );
++      rcommonFlushCmdBufLocked( &rmesa->radeon, __FUNCTION__ );
+ 
+    if (!clip_pixelrect(ctx, ctx->ReadBuffer, &x, &y, &width, &height,
+ 		       &size)) {
+-      UNLOCK_HARDWARE( rmesa );
++      UNLOCK_HARDWARE( &rmesa->radeon );
+       if (R200_DEBUG & DEBUG_PIXEL)
+ 	 fprintf(stderr, "%s totally clipped -- nothing to do\n",
+ 		 __FUNCTION__);
+@@ -212,14 +213,14 @@ r200TryReadPixels( GLcontext *ctx,
+    }
+ 
+    {
+-      __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
++      __DRIdrawablePrivate *dPriv = rmesa->radeon.dri.drawable;
+       driRenderbuffer *drb = (driRenderbuffer *) ctx->ReadBuffer->_ColorReadBuffer;
+       int nbox = dPriv->numClipRects;
+       int src_offset = drb->offset
+-		     + rmesa->r200Screen->fbLocation;
++		     + rmesa->radeon.radeonScreen->fbLocation;
+       int src_pitch = drb->pitch * drb->cpp;
+       int dst_offset = r200GartOffsetFromVirtual( rmesa, pixels );
+-      int dst_pitch = pitch * rmesa->r200Screen->cpp;
++      int dst_pitch = pitch * rmesa->radeon.radeonScreen->cpp;
+       drm_clip_rect_t *box = dPriv->pClipRects;
+       int i;
+ 
+@@ -257,12 +258,12 @@ r200TryReadPixels( GLcontext *ctx,
+ 		       bw, bh );
+       }
+ 
+-      r200FlushCmdBufLocked( rmesa, __FUNCTION__ );
++      rcommonFlushCmdBufLocked( &rmesa->radeon, __FUNCTION__ );
+    }
+-   UNLOCK_HARDWARE( rmesa );
+-
+-   r200Finish( ctx ); /* required by GL */
++   UNLOCK_HARDWARE( &rmesa->radeon );
+ 
++   radeonFinish( ctx ); /* required by GL */
 +#endif
-+    return bo;
+    return GL_TRUE;
+ }
+ 
+@@ -292,7 +293,7 @@ static void do_draw_pix( GLcontext *ctx,
+ 			 GLuint planemask)
+ {
+    r200ContextPtr rmesa = R200_CONTEXT(ctx);
+-   __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
++   __DRIdrawablePrivate *dPriv = rmesa->radeon.dri.drawable;
+    drm_clip_rect_t *box = dPriv->pClipRects;
+    struct gl_renderbuffer *rb = ctx->ReadBuffer->_ColorDrawBuffers[0];
+    driRenderbuffer *drb = (driRenderbuffer *) rb;
+@@ -301,12 +302,12 @@ static void do_draw_pix( GLcontext *ctx,
+    int blit_format;
+    int size;
+    int src_offset = r200GartOffsetFromVirtual( rmesa, pixels );
+-   int src_pitch = pitch * rmesa->r200Screen->cpp;
++   int src_pitch = pitch * rmesa->radeon.radeonScreen->cpp;
+ 
+    if (R200_DEBUG & DEBUG_PIXEL)
+       fprintf(stderr, "%s\n", __FUNCTION__);
+-
+-   switch ( rmesa->r200Screen->cpp ) {
++#if 0
++   switch ( rmesa->radeon.radeonScreen->cpp ) {
+    case 2:
+       blit_format = R200_CP_COLOR_FORMAT_RGB565;
+       break;
+@@ -318,17 +319,17 @@ static void do_draw_pix( GLcontext *ctx,
+    }
+ 
+ 
+-   LOCK_HARDWARE( rmesa );
++   LOCK_HARDWARE( &rmesa->radeon );
+ 
+    if (rmesa->store.cmd_used)
+-      r200FlushCmdBufLocked( rmesa, __FUNCTION__ );
++      rcommonFlushCmdBufLocked( &rmesa->radeon, __FUNCTION__ );
+ 
+    y -= height;			/* cope with pixel zoom */
+    
+    if (!clip_pixelrect(ctx, ctx->DrawBuffer,
+ 		       &x, &y, &width, &height,
+ 		       &size)) {
+-      UNLOCK_HARDWARE( rmesa );
++      UNLOCK_HARDWARE( &rmesa->radeon );
+       return;
+    }
+ 
+@@ -357,15 +358,16 @@ static void do_draw_pix( GLcontext *ctx,
+ 		    blit_format,
+ 		    src_pitch, src_offset,
+ 		    drb->pitch * drb->cpp,
+-		    drb->offset + rmesa->r200Screen->fbLocation,
++		    drb->offset + rmesa->radeon.radeonScreen->fbLocation,
+ 		    bx - x, by - y,
+ 		    bx, by,
+ 		    bw, bh );
+    }
+ 
+-   r200FlushCmdBufLocked( rmesa, __FUNCTION__ );
+-   r200WaitForIdleLocked( rmesa ); /* required by GL */
+-   UNLOCK_HARDWARE( rmesa );
++   rcommonFlushCmdBufLocked( &rmesa->radeon, __FUNCTION__ );
++   radeonWaitForIdleLocked( &rmesa->radeon ); /* required by GL */
++   UNLOCK_HARDWARE( &rmesa->radeon );
++#endif
+ }
+ 
+ 
+@@ -381,7 +383,7 @@ r200TryDrawPixels( GLcontext *ctx,
+    r200ContextPtr rmesa = R200_CONTEXT(ctx);
+    GLint pitch = unpack->RowLength ? unpack->RowLength : width;
+    GLuint planemask;
+-   GLuint cpp = rmesa->r200Screen->cpp;
++   GLuint cpp = rmesa->radeon.radeonScreen->cpp;
+    GLint size = height * pitch * cpp;
+ 
+    if (R200_DEBUG & DEBUG_PIXEL)
+@@ -395,7 +397,7 @@ r200TryDrawPixels( GLcontext *ctx,
+    case GL_RGB:
+    case GL_RGBA:
+    case GL_BGRA:
+-      planemask = r200PackColor(cpp,
++      planemask = radeonPackColor(cpp,
+ 				ctx->Color.ColorMask[RCOMP],
+ 				ctx->Color.ColorMask[GCOMP],
+ 				ctx->Color.ColorMask[BCOMP],
+@@ -431,7 +433,7 @@ r200TryDrawPixels( GLcontext *ctx,
+       return GL_FALSE;
+    }
+ 
+-   if ( r200IsGartMemory(rmesa, pixels, size) )
++   if (0)// r200IsGartMemory(rmesa, pixels, size) )
+    {
+       do_draw_pix( ctx, x, y, width, height, pitch, pixels, planemask );
+       return GL_TRUE;
+@@ -471,7 +473,7 @@ r200Bitmap( GLcontext *ctx, GLint px, GLint py,
+ {
+    r200ContextPtr rmesa = R200_CONTEXT(ctx);
+ 
+-   if (rmesa->Fallback)
++   if (rmesa->radeon.Fallback)
+       _swrast_Bitmap( ctx, px, py, width, height, unpack, bitmap );
+    else
+       r200PointsBitmap( ctx, px, py, width, height, unpack, bitmap );
+diff --git a/src/mesa/drivers/dri/r200/r200_reg.h b/src/mesa/drivers/dri/r200/r200_reg.h
+index 5ce287f..526a624 100644
+--- a/src/mesa/drivers/dri/r200/r200_reg.h
++++ b/src/mesa/drivers/dri/r200/r200_reg.h
+@@ -463,8 +463,11 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #define     R200_VSC_UPDATE_USER_COLOR_1_ENABLE    0x00020000
+ /* gap */
+ #define R200_SE_TCL_VECTOR_INDX_REG                0x2200
++#       define RADEON_VEC_INDX_OCTWORD_STRIDE_SHIFT  16
++#       define RADEON_VEC_INDX_DWORD_COUNT_SHIFT     28
+ #define R200_SE_TCL_VECTOR_DATA_REG                0x2204
+ #define R200_SE_TCL_SCALAR_INDX_REG                0x2208
++#       define RADEON_SCAL_INDX_DWORD_STRIDE_SHIFT  16
+ #define R200_SE_TCL_SCALAR_DATA_REG                0x220c
+ /* gap */
+ #define R200_SE_TCL_MATRIX_SEL_0                   0x2230
+@@ -949,6 +952,9 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #define     R200_LOD_BIAS_MASK                        (0xfff80000)
+ #define     R200_LOD_BIAS_SHIFT                       19
+ #define R200_PP_TXSIZE_0                  0x2c0c /* NPOT only */
++#define R200_PP_TX_WIDTHMASK_SHIFT 0
++#define R200_PP_TX_HEIGHTMASK_SHIFT 16
++
+ #define R200_PP_TXPITCH_0                 0x2c10 /* NPOT only */
+ #define R200_PP_BORDER_COLOR_0            0x2c14
+ #define R200_PP_CUBIC_FACES_0             0x2c18
+diff --git a/src/mesa/drivers/dri/r200/r200_span.c b/src/mesa/drivers/dri/r200/r200_span.c
+deleted file mode 100644
+index 9783678..0000000
+--- a/src/mesa/drivers/dri/r200/r200_span.c
++++ /dev/null
+@@ -1,307 +0,0 @@
+-/*
+-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+-
+-The Weather Channel (TM) funded Tungsten Graphics to develop the
+-initial release of the Radeon 8500 driver under the XFree86 license.
+-This notice must be preserved.
+-
+-Permission is hereby granted, free of charge, to any person obtaining
+-a copy of this software and associated documentation files (the
+-"Software"), to deal in the Software without restriction, including
+-without limitation the rights to use, copy, modify, merge, publish,
+-distribute, sublicense, and/or sell copies of the Software, and to
+-permit persons to whom the Software is furnished to do so, subject to
+-the following conditions:
+-
+-The above copyright notice and this permission notice (including the
+-next paragraph) shall be included in all copies or substantial
+-portions of the Software.
+-
+-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+-
+-**************************************************************************/
+-
+-/*
+- * Authors:
+- *   Keith Whitwell <keith@tungstengraphics.com>
+- */
+-
+-#include "main/glheader.h"
+-#include "main/imports.h"
+-#include "main/colormac.h"
+-#include "swrast/swrast.h"
+-
+-#include "r200_context.h"
+-#include "r200_ioctl.h"
+-#include "r200_state.h"
+-#include "r200_span.h"
+-#include "r200_tex.h"
+-
+-#define DBG 0
+-
+-/*
+- * Note that all information needed to access pixels in a renderbuffer
+- * should be obtained through the gl_renderbuffer parameter, not per-context
+- * information.
+- */
+-#define LOCAL_VARS						\
+-   driRenderbuffer *drb = (driRenderbuffer *) rb;		\
+-   const __DRIdrawablePrivate *dPriv = drb->dPriv;		\
+-   const GLuint bottom = dPriv->h - 1;				\
+-   GLubyte *buf = (GLubyte *) drb->flippedData			\
+-      + (dPriv->y * drb->flippedPitch + dPriv->x) * drb->cpp;	\
+-   GLuint p;							\
+-   (void) p;
+-
+-#define LOCAL_DEPTH_VARS				\
+-   driRenderbuffer *drb = (driRenderbuffer *) rb;	\
+-   const __DRIdrawablePrivate *dPriv = drb->dPriv;	\
+-   const GLuint bottom = dPriv->h - 1;			\
+-   GLuint xo = dPriv->x;				\
+-   GLuint yo = dPriv->y;				\
+-   GLubyte *buf = (GLubyte *) drb->Base.Data;
+-
+-#define LOCAL_STENCIL_VARS LOCAL_DEPTH_VARS
+-
+-#define Y_FLIP(Y) (bottom - (Y))
+-
+-#define HW_LOCK() 
+-
+-#define HW_UNLOCK()							
+-
+-
+-
+-/* ================================================================
+- * Color buffer
+- */
+-
+-/* 16 bit, RGB565 color spanline and pixel functions
+- */
+-#define SPANTMP_PIXEL_FMT GL_RGB
+-#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_5_6_5
+-
+-#define TAG(x)    r200##x##_RGB565
+-#define TAG2(x,y) r200##x##_RGB565##y
+-#define GET_PTR(X,Y) (buf + ((Y) * drb->flippedPitch + (X)) * 2)
+-#include "spantmp2.h"
+-
+-/* 32 bit, ARGB8888 color spanline and pixel functions
+- */
+-#define SPANTMP_PIXEL_FMT GL_BGRA
+-#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV
+-
+-#define TAG(x)    r200##x##_ARGB8888
+-#define TAG2(x,y) r200##x##_ARGB8888##y
+-#define GET_PTR(X,Y) (buf + ((Y) * drb->flippedPitch + (X)) * 4)
+-#include "spantmp2.h"
+-
+-
+-/* ================================================================
+- * Depth buffer
+- */
+-
+-/* The Radeon family has depth tiling on all the time, so we have to convert
+- * the x,y coordinates into the memory bus address (mba) in the same
+- * manner as the engine.  In each case, the linear block address (ba)
+- * is calculated, and then wired with x and y to produce the final
+- * memory address.
+- * The chip will do address translation on its own if the surface registers
+- * are set up correctly. It is not quite enough to get it working with hyperz too...
+- */
+-
+-/* extract bit 'b' of x, result is zero or one */
+-#define BIT(x,b) ((x & (1<<b))>>b)
+-
+-static GLuint
+-r200_mba_z32( driRenderbuffer *drb, GLint x, GLint y )
+-{
+-   GLuint pitch = drb->pitch;
+-   if (drb->depthHasSurface) {
+-      return 4 * (x + y * pitch);
+-   }
+-   else {
+-      GLuint b = ((y & 0x7FF) >> 4) * ((pitch & 0xFFF) >> 5) + ((x & 0x7FF) >> 5);
+-      GLuint a = 
+-         (BIT(x,0) << 2) |
+-         (BIT(y,0) << 3) |
+-         (BIT(x,1) << 4) |
+-         (BIT(y,1) << 5) |
+-         (BIT(x,3) << 6) |
+-         (BIT(x,4) << 7) |
+-         (BIT(x,2) << 8) |
+-         (BIT(y,2) << 9) |
+-         (BIT(y,3) << 10) |
+-         (((pitch & 0x20) ? (b & 0x01) : ((b & 0x01) ^ (BIT(y,4)))) << 11) |
+-         ((b >> 1) << 12);
+-      return a;
+-   }
+-}
+-
+-static GLuint
+-r200_mba_z16( driRenderbuffer *drb, GLint x, GLint y )
+-{
+-   GLuint pitch = drb->pitch;
+-   if (drb->depthHasSurface) {
+-      return 2 * (x + y * pitch);
+-   }
+-   else {
+-      GLuint b = ((y & 0x7FF) >> 4) * ((pitch & 0xFFF) >> 6) + ((x & 0x7FF) >> 6);
+-      GLuint a = 
+-         (BIT(x,0) << 1) |
+-         (BIT(y,0) << 2) |
+-         (BIT(x,1) << 3) |
+-         (BIT(y,1) << 4) |
+-         (BIT(x,2) << 5) |
+-         (BIT(x,4) << 6) |
+-         (BIT(x,5) << 7) |
+-         (BIT(x,3) << 8) |
+-         (BIT(y,2) << 9) |
+-         (BIT(y,3) << 10) |
+-         (((pitch & 0x40) ? (b & 0x01) : ((b & 0x01) ^ (BIT(y,4)))) << 11) |
+-         ((b >> 1) << 12);
+-      return a;
+-   }
+-}
+-
+-
+-/* 16-bit depth buffer functions
+- */
+-#define VALUE_TYPE GLushort
+-
+-#define WRITE_DEPTH( _x, _y, d )					\
+-   *(GLushort *)(buf + r200_mba_z16( drb, _x + xo, _y + yo )) = d;
+-
+-#define READ_DEPTH( d, _x, _y )						\
+-   d = *(GLushort *)(buf + r200_mba_z16( drb, _x + xo, _y + yo ));
+-
+-#define TAG(x) r200##x##_z16
+-#include "depthtmp.h"
+-
+-
+-/* 24 bit depth, 8 bit stencil depthbuffer functions
+- */
+-#define VALUE_TYPE GLuint
+-
+-#define WRITE_DEPTH( _x, _y, d )					\
+-do {									\
+-   GLuint offset = r200_mba_z32( drb, _x + xo, _y + yo );		\
+-   GLuint tmp = *(GLuint *)(buf + offset);				\
+-   tmp &= 0xff000000;							\
+-   tmp |= ((d) & 0x00ffffff);						\
+-   *(GLuint *)(buf + offset) = tmp;					\
+-} while (0)
+-
+-#define READ_DEPTH( d, _x, _y )						\
+-   d = *(GLuint *)(buf + r200_mba_z32( drb, _x + xo,			\
+-					 _y + yo )) & 0x00ffffff;
+-
+-#define TAG(x) r200##x##_z24_s8
+-#include "depthtmp.h"
+-
+-
+-/* ================================================================
+- * Stencil buffer
+- */
+-
+-/* 24 bit depth, 8 bit stencil depthbuffer functions
+- */
+-#define WRITE_STENCIL( _x, _y, d )					\
+-do {									\
+-   GLuint offset = r200_mba_z32( drb, _x + xo, _y + yo );		\
+-   GLuint tmp = *(GLuint *)(buf + offset);				\
+-   tmp &= 0x00ffffff;							\
+-   tmp |= (((d) & 0xff) << 24);						\
+-   *(GLuint *)(buf + offset) = tmp;					\
+-} while (0)
+-
+-#define READ_STENCIL( d, _x, _y )					\
+-do {									\
+-   GLuint offset = r200_mba_z32( drb, _x + xo, _y + yo );		\
+-   GLuint tmp = *(GLuint *)(buf + offset);				\
+-   tmp &= 0xff000000;							\
+-   d = tmp >> 24;							\
+-} while (0)
+-
+-#define TAG(x) r200##x##_z24_s8
+-#include "stenciltmp.h"
+-
+-
+-/* Move locking out to get reasonable span performance (10x better
+- * than doing this in HW_LOCK above).  WaitForIdle() is the main
+- * culprit.
+- */
+-
+-static void r200SpanRenderStart( GLcontext *ctx )
+-{
+-   r200ContextPtr rmesa = R200_CONTEXT( ctx );
+-
+-   R200_FIREVERTICES( rmesa );
+-   LOCK_HARDWARE( rmesa );
+-   r200WaitForIdleLocked( rmesa );
+-
+-   /* Read & rewrite the first pixel in the frame buffer.  This should
+-    * be a noop, right?  In fact without this conform fails as reading
+-    * from the framebuffer sometimes produces old results -- the
+-    * on-card read cache gets mixed up and doesn't notice that the
+-    * framebuffer has been updated.
+-    *
+-    * In the worst case this is buggy too as p might get the wrong
+-    * value first time, so really need a hidden pixel somewhere for this.
+-    */
+-   {
+-      int p;
+-      driRenderbuffer *drb =
+-	 (driRenderbuffer *) ctx->WinSysDrawBuffer->_ColorDrawBuffers[0];
+-      volatile int *buf =
+-	 (volatile int *)(rmesa->dri.screen->pFB + drb->offset);
+-      p = *buf;
+-      *buf = p;
+-   }
+-}
+-
+-static void r200SpanRenderFinish( GLcontext *ctx )
+-{
+-   r200ContextPtr rmesa = R200_CONTEXT( ctx );
+-   _swrast_flush( ctx );
+-   UNLOCK_HARDWARE( rmesa );
+-}
+-
+-void r200InitSpanFuncs( GLcontext *ctx )
+-{
+-   struct swrast_device_driver *swdd = _swrast_GetDeviceDriverReference(ctx);
+-   swdd->SpanRenderStart          = r200SpanRenderStart;
+-   swdd->SpanRenderFinish         = r200SpanRenderFinish; 
+-}
+-
+-
+-
+-/**
+- * Plug in the Get/Put routines for the given driRenderbuffer.
+- */
+-void
+-radeonSetSpanFunctions(driRenderbuffer *drb, const GLvisual *vis)
+-{
+-   if (drb->Base.InternalFormat == GL_RGBA) {
+-      if (vis->redBits == 5 && vis->greenBits == 6 && vis->blueBits == 5) {
+-         r200InitPointers_RGB565(&drb->Base);
+-      }
+-      else {
+-         r200InitPointers_ARGB8888(&drb->Base);
+-      }
+-   }
+-   else if (drb->Base.InternalFormat == GL_DEPTH_COMPONENT16) {
+-      r200InitDepthPointers_z16(&drb->Base);
+-   }
+-   else if (drb->Base.InternalFormat == GL_DEPTH_COMPONENT24) {
+-      r200InitDepthPointers_z24_s8(&drb->Base);
+-   }
+-   else if (drb->Base.InternalFormat == GL_STENCIL_INDEX8_EXT) {
+-      r200InitStencilPointers_z24_s8(&drb->Base);
+-   }
+-}
+diff --git a/src/mesa/drivers/dri/r200/r200_span.h b/src/mesa/drivers/dri/r200/r200_span.h
+deleted file mode 100644
+index bae5644..0000000
+--- a/src/mesa/drivers/dri/r200/r200_span.h
++++ /dev/null
+@@ -1,45 +0,0 @@
+-/*
+-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+-
+-The Weather Channel (TM) funded Tungsten Graphics to develop the
+-initial release of the Radeon 8500 driver under the XFree86 license.
+-This notice must be preserved.
+-
+-Permission is hereby granted, free of charge, to any person obtaining
+-a copy of this software and associated documentation files (the
+-"Software"), to deal in the Software without restriction, including
+-without limitation the rights to use, copy, modify, merge, publish,
+-distribute, sublicense, and/or sell copies of the Software, and to
+-permit persons to whom the Software is furnished to do so, subject to
+-the following conditions:
+-
+-The above copyright notice and this permission notice (including the
+-next paragraph) shall be included in all copies or substantial
+-portions of the Software.
+-
+-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+-
+-**************************************************************************/
+-
+-/*
+- * Authors:
+- *   Keith Whitwell <keith@tungstengraphics.com>
+- */
+-
+-#ifndef __R200_SPAN_H__
+-#define __R200_SPAN_H__
+-
+-#include "drirenderbuffer.h"
+-
+-extern void r200InitSpanFuncs( GLcontext *ctx );
+-
+-extern void
+-radeonSetSpanFunctions(driRenderbuffer *rb, const GLvisual *vis);
+-
+-#endif
+diff --git a/src/mesa/drivers/dri/r200/r200_state.c b/src/mesa/drivers/dri/r200/r200_state.c
+index 0eaaaf6..126f78b 100644
+--- a/src/mesa/drivers/dri/r200/r200_state.c
++++ b/src/mesa/drivers/dri/r200/r200_state.c
+@@ -47,6 +47,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "tnl/t_pipeline.h"
+ #include "swrast_setup/swrast_setup.h"
+ 
++#include "radeon_common.h"
++#include "radeon_mipmap_tree.h"
+ #include "r200_context.h"
+ #include "r200_ioctl.h"
+ #include "r200_state.h"
+@@ -114,8 +116,8 @@ static void r200BlendColor( GLcontext *ctx, const GLfloat cf[4] )
+    CLAMPED_FLOAT_TO_UBYTE(color[1], cf[1]);
+    CLAMPED_FLOAT_TO_UBYTE(color[2], cf[2]);
+    CLAMPED_FLOAT_TO_UBYTE(color[3], cf[3]);
+-   if (rmesa->r200Screen->drmSupportsBlendColor)
+-      rmesa->hw.ctx.cmd[CTX_RB3D_BLENDCOLOR] = r200PackColor( 4, color[0], color[1], color[2], color[3] );
++   if (rmesa->radeon.radeonScreen->drmSupportsBlendColor)
++      rmesa->hw.ctx.cmd[CTX_RB3D_BLENDCOLOR] = radeonPackColor( 4, color[0], color[1], color[2], color[3] );
+ }
+ 
+ /**
+@@ -213,7 +215,7 @@ static void r200_set_blend_state( GLcontext * ctx )
+ 
+    R200_STATECHANGE( rmesa, ctx );
+ 
+-   if (rmesa->r200Screen->drmSupportsBlendColor) {
++   if (rmesa->radeon.radeonScreen->drmSupportsBlendColor) {
+       if (ctx->Color.ColorLogicOpEnabled) {
+          rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] =  cntl | R200_ROP_ENABLE;
+          rmesa->hw.ctx.cmd[CTX_RB3D_ABLENDCNTL] = eqn | func;
+@@ -278,7 +280,7 @@ static void r200_set_blend_state( GLcontext * ctx )
+       return;
+    }
+ 
+-   if (!rmesa->r200Screen->drmSupportsBlendColor) {
++   if (!rmesa->radeon.radeonScreen->drmSupportsBlendColor) {
+       rmesa->hw.ctx.cmd[CTX_RB3D_BLENDCNTL] = eqn | func;
+       return;
+    }
+@@ -383,10 +385,10 @@ static void r200ClearDepth( GLcontext *ctx, GLclampd d )
+ 
+    switch ( format ) {
+    case R200_DEPTH_FORMAT_16BIT_INT_Z:
+-      rmesa->state.depth.clear = d * 0x0000ffff;
++      rmesa->radeon.state.depth.clear = d * 0x0000ffff;
+       break;
+    case R200_DEPTH_FORMAT_24BIT_INT_Z:
+-      rmesa->state.depth.clear = d * 0x00ffffff;
++      rmesa->radeon.state.depth.clear = d * 0x00ffffff;
+       break;
+    }
+ }
+@@ -480,7 +482,7 @@ static void r200Fogfv( GLcontext *ctx, GLenum pname, const GLfloat *param )
+    case GL_FOG_COLOR: 
+       R200_STATECHANGE( rmesa, ctx );
+       UNCLAMPED_FLOAT_TO_RGB_CHAN( col, ctx->Fog.Color );
+-      i = r200PackColor( 4, col[0], col[1], col[2], 0 );
++      i = radeonPackColor( 4, col[0], col[1], col[2], 0 );
+       rmesa->hw.ctx.cmd[CTX_PP_FOG_COLOR] &= ~R200_FOG_COLOR_MASK;
+       rmesa->hw.ctx.cmd[CTX_PP_FOG_COLOR] |= i;
+       break;
+@@ -521,102 +523,6 @@ static void r200Fogfv( GLcontext *ctx, GLenum pname, const GLfloat *param )
+    }
+ }
+ 
+-
+-/* =============================================================
+- * Scissoring
+- */
+-
+-
+-static GLboolean intersect_rect( drm_clip_rect_t *out,
+-				 drm_clip_rect_t *a,
+-				 drm_clip_rect_t *b )
+-{
+-   *out = *a;
+-   if ( b->x1 > out->x1 ) out->x1 = b->x1;
+-   if ( b->y1 > out->y1 ) out->y1 = b->y1;
+-   if ( b->x2 < out->x2 ) out->x2 = b->x2;
+-   if ( b->y2 < out->y2 ) out->y2 = b->y2;
+-   if ( out->x1 >= out->x2 ) return GL_FALSE;
+-   if ( out->y1 >= out->y2 ) return GL_FALSE;
+-   return GL_TRUE;
+-}
+-
+-
+-void r200RecalcScissorRects( r200ContextPtr rmesa )
+-{
+-   drm_clip_rect_t *out;
+-   int i;
+-
+-   /* Grow cliprect store?
+-    */
+-   if (rmesa->state.scissor.numAllocedClipRects < rmesa->numClipRects) {
+-      while (rmesa->state.scissor.numAllocedClipRects < rmesa->numClipRects) {
+-	 rmesa->state.scissor.numAllocedClipRects += 1;	/* zero case */
+-	 rmesa->state.scissor.numAllocedClipRects *= 2;
+-      }
+-
+-      if (rmesa->state.scissor.pClipRects)
+-	 FREE(rmesa->state.scissor.pClipRects);
+-
+-      rmesa->state.scissor.pClipRects = 
+-	 MALLOC( rmesa->state.scissor.numAllocedClipRects * 
+-		 sizeof(drm_clip_rect_t) );
+-
+-      if ( rmesa->state.scissor.pClipRects == NULL ) {
+-	 rmesa->state.scissor.numAllocedClipRects = 0;
+-	 return;
+-      }
+-   }
+-   
+-   out = rmesa->state.scissor.pClipRects;
+-   rmesa->state.scissor.numClipRects = 0;
+-
+-   for ( i = 0 ; i < rmesa->numClipRects ;  i++ ) {
+-      if ( intersect_rect( out, 
+-			   &rmesa->pClipRects[i], 
+-			   &rmesa->state.scissor.rect ) ) {
+-	 rmesa->state.scissor.numClipRects++;
+-	 out++;
+-      }
+-   }
+-}
+-
+-
+-static void r200UpdateScissor( GLcontext *ctx )
+-{
+-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+-
+-   if ( rmesa->dri.drawable ) {
+-      __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
+-
+-      int x = ctx->Scissor.X;
+-      int y = dPriv->h - ctx->Scissor.Y - ctx->Scissor.Height;
+-      int w = ctx->Scissor.X + ctx->Scissor.Width - 1;
+-      int h = dPriv->h - ctx->Scissor.Y - 1;
+-
+-      rmesa->state.scissor.rect.x1 = x + dPriv->x;
+-      rmesa->state.scissor.rect.y1 = y + dPriv->y;
+-      rmesa->state.scissor.rect.x2 = w + dPriv->x + 1;
+-      rmesa->state.scissor.rect.y2 = h + dPriv->y + 1;
+-
+-      r200RecalcScissorRects( rmesa );
+-   }
+-}
+-
+-
+-static void r200Scissor( GLcontext *ctx,
+-			   GLint x, GLint y, GLsizei w, GLsizei h )
+-{
+-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+-
+-   if ( ctx->Scissor.Enabled ) {
+-      R200_FIREVERTICES( rmesa );	/* don't pipeline cliprect changes */
+-      r200UpdateScissor( ctx );
+-   }
+-
+-}
+-
+-
+ /* =============================================================
+  * Culling
+  */
+@@ -803,7 +709,7 @@ static void r200ColorMask( GLcontext *ctx,
+ 			   GLboolean b, GLboolean a )
+ {
+    r200ContextPtr rmesa = R200_CONTEXT(ctx);
+-   GLuint mask = r200PackColor( rmesa->r200Screen->cpp,
++   GLuint mask = radeonPackColor( rmesa->radeon.radeonScreen->cpp,
+ 				ctx->Color.ColorMask[RCOMP],
+ 				ctx->Color.ColorMask[GCOMP],
+ 				ctx->Color.ColorMask[BCOMP],
+@@ -834,7 +740,7 @@ static void r200PolygonOffset( GLcontext *ctx,
+ 			       GLfloat factor, GLfloat units )
+ {
+    r200ContextPtr rmesa = R200_CONTEXT(ctx);
+-   float_ui32_type constant =  { units * rmesa->state.depth.scale };
++   float_ui32_type constant =  { units * rmesa->radeon.state.depth.scale };
+    float_ui32_type factoru = { factor };
+ 
+ /*    factor *= 2; */
+@@ -861,15 +767,15 @@ static void r200PolygonStipple( GLcontext *ctx, const GLubyte *mask )
+ 
+    /* TODO: push this into cmd mechanism
+     */
+-   R200_FIREVERTICES( rmesa );
+-   LOCK_HARDWARE( rmesa );
++   radeon_firevertices(&rmesa->radeon);
++   LOCK_HARDWARE( &rmesa->radeon );
+ 
+    /* FIXME: Use window x,y offsets into stipple RAM.
+     */
+    stipple.mask = rmesa->state.stipple.mask;
+-   drmCommandWrite( rmesa->dri.fd, DRM_RADEON_STIPPLE, 
++   drmCommandWrite( rmesa->radeon.dri.fd, DRM_RADEON_STIPPLE, 
+                     &stipple, sizeof(stipple) );
+-   UNLOCK_HARDWARE( rmesa );
++   UNLOCK_HARDWARE( &rmesa->radeon );
+ }
+ 
+ static void r200PolygonMode( GLcontext *ctx, GLenum face, GLenum mode )
+@@ -881,7 +787,7 @@ static void r200PolygonMode( GLcontext *ctx, GLenum face, GLenum mode )
+     * cases work. 
+     */
+    TCL_FALLBACK( ctx, R200_TCL_FALLBACK_UNFILLED, flag);
+-   if (rmesa->TclFallback) {
++   if (rmesa->radeon.TclFallback) {
+       r200ChooseRenderState( ctx );
+       r200ChooseVertexState( ctx );
+    }
+@@ -958,7 +864,7 @@ static void r200UpdateSpecular( GLcontext *ctx )
+ 
+    /* Update vertex/render formats
+     */
+-   if (rmesa->TclFallback) { 
++   if (rmesa->radeon.TclFallback) { 
+       r200ChooseRenderState( ctx );
+       r200ChooseVertexState( ctx );
+    }
+@@ -1430,7 +1336,7 @@ static void r200LightModelfv( GLcontext *ctx, GLenum pname,
+ 	    rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] |= R200_LIGHT_TWOSIDE;
+ 	 else
+ 	    rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] &= ~(R200_LIGHT_TWOSIDE);
+-	 if (rmesa->TclFallback) {
++	 if (rmesa->radeon.TclFallback) {
+ 	    r200ChooseRenderState( ctx );
+ 	    r200ChooseVertexState( ctx );
+ 	 }
+@@ -1675,7 +1581,7 @@ static void r200ClearStencil( GLcontext *ctx, GLint s )
+ {
+    r200ContextPtr rmesa = R200_CONTEXT(ctx);
+ 
+-   rmesa->state.stencil.clear = 
++   rmesa->radeon.state.stencil.clear = 
+       ((GLuint) (ctx->Stencil.Clear & 0xff) |
+        (0xff << R200_STENCIL_MASK_SHIFT) |
+        ((ctx->Stencil.WriteMask[0] & 0xff) << R200_STENCIL_WRITEMASK_SHIFT));
+@@ -1700,19 +1606,19 @@ static void r200ClearStencil( GLcontext *ctx, GLint s )
+ void r200UpdateWindow( GLcontext *ctx )
+ {
+    r200ContextPtr rmesa = R200_CONTEXT(ctx);
+-   __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
+-   GLfloat xoffset = (GLfloat)dPriv->x;
+-   GLfloat yoffset = (GLfloat)dPriv->y + dPriv->h;
++   __DRIdrawablePrivate *dPriv = rmesa->radeon.dri.drawable;
++   GLfloat xoffset = dPriv ? (GLfloat) dPriv->x : 0;
++   GLfloat yoffset = dPriv ? (GLfloat) dPriv->y + dPriv->h : 0;
+    const GLfloat *v = ctx->Viewport._WindowMap.m;
+ 
+    float_ui32_type sx = { v[MAT_SX] };
+    float_ui32_type tx = { v[MAT_TX] + xoffset + SUBPIXEL_X };
+    float_ui32_type sy = { - v[MAT_SY] };
+    float_ui32_type ty = { (- v[MAT_TY]) + yoffset + SUBPIXEL_Y };
+-   float_ui32_type sz = { v[MAT_SZ] * rmesa->state.depth.scale };
+-   float_ui32_type tz = { v[MAT_TZ] * rmesa->state.depth.scale };
++   float_ui32_type sz = { v[MAT_SZ] * rmesa->radeon.state.depth.scale };
++   float_ui32_type tz = { v[MAT_TZ] * rmesa->radeon.state.depth.scale };
+ 
+-   R200_FIREVERTICES( rmesa );
++   radeon_firevertices(&rmesa->radeon);
+    R200_STATECHANGE( rmesa, vpt );
+ 
+    rmesa->hw.vpt.cmd[VPT_SE_VPORT_XSCALE]  = sx.ui32;
+@@ -1744,7 +1650,7 @@ static void r200DepthRange( GLcontext *ctx, GLclampd nearval,
+ void r200UpdateViewportOffset( GLcontext *ctx )
+ {
+    r200ContextPtr rmesa = R200_CONTEXT(ctx);
+-   __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
++   __DRIdrawablePrivate *dPriv = rmesa->radeon.dri.drawable;
+    GLfloat xoffset = (GLfloat)dPriv->x;
+    GLfloat yoffset = (GLfloat)dPriv->y + dPriv->h;
+    const GLfloat *v = ctx->Viewport._WindowMap.m;
+@@ -1774,8 +1680,8 @@ void r200UpdateViewportOffset( GLcontext *ctx )
+                 R200_STIPPLE_Y_OFFSET_MASK);
+ 
+          /* add magic offsets, then invert */
+-         stx = 31 - ((rmesa->dri.drawable->x - 1) & R200_STIPPLE_COORD_MASK);
+-         sty = 31 - ((rmesa->dri.drawable->y + rmesa->dri.drawable->h - 1)
++         stx = 31 - ((rmesa->radeon.dri.drawable->x - 1) & R200_STIPPLE_COORD_MASK);
++         sty = 31 - ((rmesa->radeon.dri.drawable->y + rmesa->radeon.dri.drawable->h - 1)
+                      & R200_STIPPLE_COORD_MASK);
+ 
+          m |= ((stx << R200_STIPPLE_X_OFFSET_SHIFT) |
+@@ -1788,7 +1694,7 @@ void r200UpdateViewportOffset( GLcontext *ctx )
+       }
+    }
+ 
+-   r200UpdateScissor( ctx );
++   radeonUpdateScissor( ctx );
+ }
+ 
+ 
+@@ -1805,7 +1711,7 @@ static void r200ClearColor( GLcontext *ctx, const GLfloat c[4] )
+    CLAMPED_FLOAT_TO_UBYTE(color[1], c[1]);
+    CLAMPED_FLOAT_TO_UBYTE(color[2], c[2]);
+    CLAMPED_FLOAT_TO_UBYTE(color[3], c[3]);
+-   rmesa->state.color.clear = r200PackColor( rmesa->r200Screen->cpp,
++   rmesa->radeon.state.color.clear = radeonPackColor( rmesa->radeon.radeonScreen->cpp,
+                                              color[0], color[1],
+                                              color[2], color[3] );
+ }
+@@ -1849,56 +1755,6 @@ static void r200LogicOpCode( GLcontext *ctx, GLenum opcode )
+ }
+ 
+ 
+-/*
+- * Set up the cliprects for either front or back-buffer drawing.
+- */
+-void r200SetCliprects( r200ContextPtr rmesa )
+-{
+-   __DRIdrawablePrivate *const drawable = rmesa->dri.drawable;
+-   __DRIdrawablePrivate *const readable = rmesa->dri.readable;
+-   GLframebuffer *const draw_fb = (GLframebuffer*) drawable->driverPrivate;
+-   GLframebuffer *const read_fb = (GLframebuffer*) readable->driverPrivate;
+-
+-   if (draw_fb->_ColorDrawBufferIndexes[0] == BUFFER_BIT_BACK_LEFT) {
+-      /* Can't ignore 2d windows if we are page flipping.
+-       */
+-      if ( drawable->numBackClipRects == 0 || rmesa->doPageFlip ) {
+-         rmesa->numClipRects = drawable->numClipRects;
+-         rmesa->pClipRects = drawable->pClipRects;
+-      }
+-      else {
+-         rmesa->numClipRects = drawable->numBackClipRects;
+-         rmesa->pClipRects = drawable->pBackClipRects;
+-      }
+-   }
+-   else {
+-     /* front buffer (or none, or multiple buffers) */
+-     rmesa->numClipRects = drawable->numClipRects;
+-     rmesa->pClipRects = drawable->pClipRects;
+-  }
+-
+-   if ((draw_fb->Width != drawable->w) || (draw_fb->Height != drawable->h)) {
+-      _mesa_resize_framebuffer(rmesa->glCtx, draw_fb,
+-			       drawable->w, drawable->h);
+-      draw_fb->Initialized = GL_TRUE;
+-   }
+-
+-   if (drawable != readable) {
+-      if ((read_fb->Width != readable->w) ||
+-	  (read_fb->Height != readable->h)) {
+-	 _mesa_resize_framebuffer(rmesa->glCtx, read_fb,
+-				  readable->w, readable->h);
+-	 read_fb->Initialized = GL_TRUE;
+-      }
+-   }
+-
+-   if (rmesa->state.scissor.enabled)
+-      r200RecalcScissorRects( rmesa );
+-
+-   rmesa->lastStamp = drawable->lastStamp;
+-}
+-
+-
+ static void r200DrawBuffer( GLcontext *ctx, GLenum mode )
+ {
+    r200ContextPtr rmesa = R200_CONTEXT(ctx);
+@@ -1907,7 +1763,7 @@ static void r200DrawBuffer( GLcontext *ctx, GLenum mode )
+       fprintf(stderr, "%s %s\n", __FUNCTION__,
+ 	      _mesa_lookup_enum_by_nr( mode ));
+ 
+-   R200_FIREVERTICES(rmesa);	/* don't pipeline cliprect changes */
++   radeon_firevertices(&rmesa->radeon);	/* don't pipeline cliprect changes */
+ 
+    if (ctx->DrawBuffer->_NumColorDrawBuffers != 1) {
+       /* 0 (GL_NONE) buffers or multiple color drawing buffers */
+@@ -1925,7 +1781,8 @@ static void r200DrawBuffer( GLcontext *ctx, GLenum mode )
+       return;
+    }
+ 
+-   r200SetCliprects( rmesa );
++   radeonSetCliprects( &rmesa->radeon );
++   radeonUpdatePageFlipping(&rmesa->radeon);
+ 
+    /* We'll set the drawing engine's offset/pitch parameters later
+     * when we update other state.
+@@ -2013,10 +1870,10 @@ static void r200Enable( GLcontext *ctx, GLenum cap, GLboolean state )
+       R200_STATECHANGE(rmesa, ctx );
+       if ( state ) {
+ 	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  R200_DITHER_ENABLE;
+-	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~rmesa->state.color.roundEnable;
++	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~rmesa->radeon.state.color.roundEnable;
+       } else {
+ 	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~R200_DITHER_ENABLE;
+-	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  rmesa->state.color.roundEnable;
++	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  rmesa->radeon.state.color.roundEnable;
+       }
+       break;
+ 
+@@ -2031,7 +1888,7 @@ static void r200Enable( GLcontext *ctx, GLenum cap, GLboolean state )
+ 	 rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] &= ~R200_TCL_FOG_MASK;
+       }
+       r200UpdateSpecular( ctx ); /* for PK_SPEC */
+-      if (rmesa->TclFallback) 
++      if (rmesa->radeon.TclFallback) 
+ 	 r200ChooseVertexState( ctx );
+       _mesa_allow_light_in_model( ctx, !state );
+       break;
+@@ -2068,7 +1925,7 @@ static void r200Enable( GLcontext *ctx, GLenum cap, GLboolean state )
+    case GL_LIGHTING:
+       r200UpdateSpecular(ctx);
+       /* for reflection map fixup - might set recheck_texgen for all units too */
+-      rmesa->NewGLState |= _NEW_TEXTURE;
++      rmesa->radeon.NewGLState |= _NEW_TEXTURE;
+       break;
+ 
+    case GL_LINE_SMOOTH:
+@@ -2181,13 +2038,13 @@ static void r200Enable( GLcontext *ctx, GLenum cap, GLboolean state )
+    }
+ 
+    case GL_SCISSOR_TEST:
+-      R200_FIREVERTICES( rmesa );
+-      rmesa->state.scissor.enabled = state;
+-      r200UpdateScissor( ctx );
++      radeon_firevertices(&rmesa->radeon);
++      rmesa->radeon.state.scissor.enabled = state;
++      radeonUpdateScissor( ctx );
+       break;
+ 
+    case GL_STENCIL_TEST:
+-      if ( rmesa->state.stencil.hwBuffer ) {
++      if ( rmesa->radeon.state.stencil.hwBuffer ) {
+ 	 R200_STATECHANGE( rmesa, ctx );
+ 	 if ( state ) {
+ 	    rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  R200_STENCIL_ENABLE;
+@@ -2443,42 +2300,99 @@ r200UpdateDrawBuffer(GLcontext *ctx)
+ {
+    r200ContextPtr rmesa = R200_CONTEXT(ctx);
+    struct gl_framebuffer *fb = ctx->DrawBuffer;
+-   driRenderbuffer *drb;
++   struct radeon_renderbuffer *rrb;
+ 
+    if (fb->_ColorDrawBufferIndexes[0] == BUFFER_FRONT_LEFT) {
+-      /* draw to front */
+-      drb = (driRenderbuffer *) fb->Attachment[BUFFER_FRONT_LEFT].Renderbuffer;
+-   }
+-   else if (fb->_ColorDrawBufferIndexes[0] == BUFFER_BACK_LEFT) {
+-      /* draw to back */
+-      drb = (driRenderbuffer *) fb->Attachment[BUFFER_BACK_LEFT].Renderbuffer;
+-   }
+-   else {
+-      /* drawing to multiple buffers, or none */
+-      return;
++     /* draw to front */
++     rrb = (void *) fb->Attachment[BUFFER_FRONT_LEFT].Renderbuffer;
++   } else if (fb->_ColorDrawBufferIndexes[0] == BUFFER_BACK_LEFT) {
++     /* draw to back */
++     rrb = (void *) fb->Attachment[BUFFER_BACK_LEFT].Renderbuffer;
++   } else {
++     /* drawing to multiple buffers, or none */
++     return;
+    }
+ 
+-   assert(drb);
+-   assert(drb->flippedPitch);
++   assert(rrb);
++   assert(rrb->pitch);
+ 
+    R200_STATECHANGE( rmesa, ctx );
+ 
++#if 0
+    /* Note: we used the (possibly) page-flipped values */
+    rmesa->hw.ctx.cmd[CTX_RB3D_COLOROFFSET]
+-     = ((drb->flippedOffset + rmesa->r200Screen->fbLocation)
++     = ((rrb->flippedOffset + rmesa->radeon.radeonScreen->fbLocation)
+ 	& R200_COLOROFFSET_MASK);
+    rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] = drb->flippedPitch;
+-   if (rmesa->sarea->tiling_enabled) {
++   if (rmesa->radeon.sarea->tiling_enabled) {
+       rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |= R200_COLOR_TILE_ENABLE;
+    }
++#endif
+ }
+ 
++static GLboolean r200ValidateBuffers(GLcontext *ctx) /* GL_FALSE when radeon_cs_space_check() reports OP_TO_BIG, or FLUSH again after one flush; GL_TRUE otherwise */
++{
++   r200ContextPtr rmesa = R200_CONTEXT(ctx);
++   struct radeon_cs_space_check bos[8]; /* NOTE(review): fixed capacity -- color + depth + one entry per enabled texture unit must fit; confirm against MaxTextureImageUnits */
++   struct radeon_renderbuffer *rrb;
++   int num_bo = 0;
++   int i;
++   int flushed = 0, ret; /* flushed: set once a flush-and-retry has been attempted */
++again: /* re-entered after radeonFlush(); the list is rebuilt from scratch */
++   num_bo = 0;
++   
++   rrb = radeon_get_colorbuffer(&rmesa->radeon);
++   /* color buffer */
++   if (rrb && rrb->bo) {
++      bos[num_bo].bo = rrb->bo;
++      bos[num_bo].read_domains = 0;
++      bos[num_bo].write_domain = RADEON_GEM_DOMAIN_VRAM;
++      bos[num_bo].new_accounted = 0;
++      num_bo++;
++   }
++
++   /* depth buffer */
++   rrb = radeon_get_depthbuffer(&rmesa->radeon);
++   /* accounted like the color buffer: VRAM write domain, no read domains */
++   if (rrb && rrb->bo) {
++      bos[num_bo].bo = rrb->bo;
++      bos[num_bo].read_domains = 0;
++      bos[num_bo].write_domain = RADEON_GEM_DOMAIN_VRAM;
++      bos[num_bo].new_accounted = 0;
++      num_bo++;
++   }
++
++   for (i = 0; i < ctx->Const.MaxTextureImageUnits; ++i) { /* enabled texture units are added read-only (GTT or VRAM) */
++      radeonTexObj *t;
++      
++      if (!ctx->Texture.Unit[i]._ReallyEnabled)
++	 continue;
++      
++      t = radeon_tex_obj(ctx->Texture.Unit[i]._Current);
++      bos[num_bo].bo = t->mt->bo; /* NOTE(review): assumes t->mt is non-NULL for every enabled unit -- confirm */
++      bos[num_bo].read_domains = RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM;
++      bos[num_bo].write_domain = 0;
++      bos[num_bo].new_accounted = 0;
++      num_bo++;
++   }
++   
++   ret = radeon_cs_space_check(rmesa->radeon.cmdbuf.cs, bos, num_bo);
++   if (ret == RADEON_CS_SPACE_OP_TO_BIG)
++      return GL_FALSE;
++   if (ret == RADEON_CS_SPACE_FLUSH) { /* flush, then retry once; a second FLUSH result is a failure */
++      radeonFlush(ctx);
++      if (flushed)
++	 return GL_FALSE;
++      flushed = 1;
++      goto again;
++   }
++   return GL_TRUE;
 +}
+ 
+-
+-void r200ValidateState( GLcontext *ctx )
++GLboolean r200ValidateState( GLcontext *ctx )
+ {
+    r200ContextPtr rmesa = R200_CONTEXT(ctx);
+-   GLuint new_state = rmesa->NewGLState;
++   GLuint new_state = rmesa->radeon.NewGLState;
+ 
+    if (new_state & (_NEW_BUFFERS | _NEW_COLOR | _NEW_PIXEL)) {
+      r200UpdateDrawBuffer(ctx);
+@@ -2486,10 +2400,14 @@ void r200ValidateState( GLcontext *ctx )
+ 
+    if (new_state & (_NEW_TEXTURE | _NEW_PROGRAM)) {
+       r200UpdateTextureState( ctx );
+-      new_state |= rmesa->NewGLState; /* may add TEXTURE_MATRIX */
++      new_state |= rmesa->radeon.NewGLState; /* may add TEXTURE_MATRIX */
+       r200UpdateLocalViewer( ctx );
+    }
+ 
++   /* we need to do a space check here */
++   if (!r200ValidateBuffers(ctx))
++     return GL_FALSE;
++
+ /* FIXME: don't really need most of these when vertex progs are enabled */
+ 
+    /* Need an event driven matrix update?
+@@ -2533,7 +2451,8 @@ void r200ValidateState( GLcontext *ctx )
+       else TCL_FALLBACK(ctx, R200_TCL_FALLBACK_VERTEX_PROGRAM, 0);
+    }
+ 
+-   rmesa->NewGLState = 0;
++   rmesa->radeon.NewGLState = 0;
++   return GL_TRUE;
+ }
+ 
+ 
+@@ -2544,7 +2463,7 @@ static void r200InvalidateState( GLcontext *ctx, GLuint new_state )
+    _vbo_InvalidateState( ctx, new_state );
+    _tnl_InvalidateState( ctx, new_state );
+    _ae_invalidate_state( ctx, new_state );
+-   R200_CONTEXT(ctx)->NewGLState |= new_state;
++   R200_CONTEXT(ctx)->radeon.NewGLState |= new_state;
+ }
+ 
+ /* A hack.  The r200 can actually cope just fine with materials
+@@ -2573,12 +2492,13 @@ static void r200WrapRunPipeline( GLcontext *ctx )
+    GLboolean has_material;
+ 
+    if (0)
+-      fprintf(stderr, "%s, newstate: %x\n", __FUNCTION__, rmesa->NewGLState);
++      fprintf(stderr, "%s, newstate: %x\n", __FUNCTION__, rmesa->radeon.NewGLState);
+ 
+    /* Validate state:
+     */
+-   if (rmesa->NewGLState)
+-      r200ValidateState( ctx );
++   if (rmesa->radeon.NewGLState)
++      if (!r200ValidateState( ctx ))
++	 FALLBACK(rmesa, RADEON_FALLBACK_TEXTURE, GL_TRUE);
+ 
+    has_material = !ctx->VertexProgram._Enabled && ctx->Light.Enabled && check_material( ctx );
+ 
+@@ -2636,7 +2556,7 @@ void r200InitStateFuncs( struct dd_function_table *functions )
+    functions->PointParameterfv		= r200PointParameter;
+    functions->PointSize			= r200PointSize;
+    functions->RenderMode		= r200RenderMode;
+-   functions->Scissor			= r200Scissor;
++   functions->Scissor			= radeonScissor;
+    functions->ShadeModel		= r200ShadeModel;
+    functions->StencilFuncSeparate	= r200StencilFuncSeparate;
+    functions->StencilMaskSeparate	= r200StencilMaskSeparate;
+diff --git a/src/mesa/drivers/dri/r200/r200_state.h b/src/mesa/drivers/dri/r200/r200_state.h
+index a917163..1dddbfd 100644
+--- a/src/mesa/drivers/dri/r200/r200_state.h
++++ b/src/mesa/drivers/dri/r200/r200_state.h
+@@ -43,13 +43,11 @@ extern void r200InitTnlFuncs( GLcontext *ctx );
+ 
+ extern void r200UpdateMaterial( GLcontext *ctx );
+ 
+-extern void r200SetCliprects( r200ContextPtr rmesa );
+-extern void r200RecalcScissorRects( r200ContextPtr rmesa );
+ extern void r200UpdateViewportOffset( GLcontext *ctx );
+ extern void r200UpdateWindow( GLcontext *ctx );
+ extern void r200UpdateDrawBuffer(GLcontext *ctx);
+ 
+-extern void r200ValidateState( GLcontext *ctx );
++extern GLboolean r200ValidateState( GLcontext *ctx );
+ 
+ extern void r200PrintDirty( r200ContextPtr rmesa,
+ 			      const char *msg );
+@@ -59,7 +57,7 @@ extern void r200Fallback( GLcontext *ctx, GLuint bit, GLboolean mode );
+ #define FALLBACK( rmesa, bit, mode ) do {				\
+    if ( 0 ) fprintf( stderr, "FALLBACK in %s: #%d=%d\n",		\
+ 		     __FUNCTION__, bit, mode );				\
+-   r200Fallback( rmesa->glCtx, bit, mode );				\
++   r200Fallback( rmesa->radeon.glCtx, bit, mode );				\
+ } while (0)
+ 
+ extern void r200LightingSpaceChange( GLcontext *ctx );
+diff --git a/src/mesa/drivers/dri/r200/r200_state_init.c b/src/mesa/drivers/dri/r200/r200_state_init.c
+index 9e4677e..b40690e 100644
+--- a/src/mesa/drivers/dri/r200/r200_state_init.c
++++ b/src/mesa/drivers/dri/r200/r200_state_init.c
+@@ -43,6 +43,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "tnl/t_pipeline.h"
+ #include "swrast_setup/swrast_setup.h"
+ 
++#include "radeon_common.h"
++#include "radeon_mipmap_tree.h"
+ #include "r200_context.h"
+ #include "r200_ioctl.h"
+ #include "r200_state.h"
+@@ -52,31 +54,145 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ #include "xmlpool.h"
+ 
++/* New (1.3) state mechanism.  3 commands (packet, scalar, vector) in
++ * 1.3 cmdbuffers allow all previous state to be updated as well as
++ * the tcl scalar and vector areas.
++ */
++static struct {
++	int start; /* first argument cmdpkt() hands to CP_PACKET0() */
++	int len; /* packet length; cmdpkt() hands len - 1 to CP_PACKET0() */
++	const char *name; /* printable name of the entry */
++} packet[RADEON_MAX_STATE_PACKETS] = { /* indexed by the packet id given to cmdpkt() */
++	{RADEON_PP_MISC, 7, "RADEON_PP_MISC"},
++	{RADEON_PP_CNTL, 3, "RADEON_PP_CNTL"},
++	{RADEON_RB3D_COLORPITCH, 1, "RADEON_RB3D_COLORPITCH"},
++	{RADEON_RE_LINE_PATTERN, 2, "RADEON_RE_LINE_PATTERN"},
++	{RADEON_SE_LINE_WIDTH, 1, "RADEON_SE_LINE_WIDTH"},
++	{RADEON_PP_LUM_MATRIX, 1, "RADEON_PP_LUM_MATRIX"},
++	{RADEON_PP_ROT_MATRIX_0, 2, "RADEON_PP_ROT_MATRIX_0"},
++	{RADEON_RB3D_STENCILREFMASK, 3, "RADEON_RB3D_STENCILREFMASK"},
++	{RADEON_SE_VPORT_XSCALE, 6, "RADEON_SE_VPORT_XSCALE"},
++	{RADEON_SE_CNTL, 2, "RADEON_SE_CNTL"},
++	{RADEON_SE_CNTL_STATUS, 1, "RADEON_SE_CNTL_STATUS"},
++	{RADEON_RE_MISC, 1, "RADEON_RE_MISC"},
++	{RADEON_PP_TXFILTER_0, 6, "RADEON_PP_TXFILTER_0"},
++	{RADEON_PP_BORDER_COLOR_0, 1, "RADEON_PP_BORDER_COLOR_0"},
++	{RADEON_PP_TXFILTER_1, 6, "RADEON_PP_TXFILTER_1"},
++	{RADEON_PP_BORDER_COLOR_1, 1, "RADEON_PP_BORDER_COLOR_1"},
++	{RADEON_PP_TXFILTER_2, 6, "RADEON_PP_TXFILTER_2"},
++	{RADEON_PP_BORDER_COLOR_2, 1, "RADEON_PP_BORDER_COLOR_2"},
++	{RADEON_SE_ZBIAS_FACTOR, 2, "RADEON_SE_ZBIAS_FACTOR"},
++	{RADEON_SE_TCL_OUTPUT_VTX_FMT, 11, "RADEON_SE_TCL_OUTPUT_VTX_FMT"},
++	{RADEON_SE_TCL_MATERIAL_EMMISSIVE_RED, 17,
++		    "RADEON_SE_TCL_MATERIAL_EMMISSIVE_RED"},
++	{R200_PP_TXCBLEND_0, 4, "R200_PP_TXCBLEND_0"},
++	{R200_PP_TXCBLEND_1, 4, "R200_PP_TXCBLEND_1"},
++	{R200_PP_TXCBLEND_2, 4, "R200_PP_TXCBLEND_2"},
++	{R200_PP_TXCBLEND_3, 4, "R200_PP_TXCBLEND_3"},
++	{R200_PP_TXCBLEND_4, 4, "R200_PP_TXCBLEND_4"},
++	{R200_PP_TXCBLEND_5, 4, "R200_PP_TXCBLEND_5"},
++	{R200_PP_TXCBLEND_6, 4, "R200_PP_TXCBLEND_6"},
++	{R200_PP_TXCBLEND_7, 4, "R200_PP_TXCBLEND_7"},
++	{R200_SE_TCL_LIGHT_MODEL_CTL_0, 6, "R200_SE_TCL_LIGHT_MODEL_CTL_0"},
++	{R200_PP_TFACTOR_0, 6, "R200_PP_TFACTOR_0"},
++	{R200_SE_VTX_FMT_0, 4, "R200_SE_VTX_FMT_0"},
++	{R200_SE_VAP_CNTL, 1, "R200_SE_VAP_CNTL"},
++	{R200_SE_TCL_MATRIX_SEL_0, 5, "R200_SE_TCL_MATRIX_SEL_0"},
++	{R200_SE_TCL_TEX_PROC_CTL_2, 5, "R200_SE_TCL_TEX_PROC_CTL_2"},
++	{R200_SE_TCL_UCP_VERT_BLEND_CTL, 1, "R200_SE_TCL_UCP_VERT_BLEND_CTL"},
++	{R200_PP_TXFILTER_0, 6, "R200_PP_TXFILTER_0"},
++	{R200_PP_TXFILTER_1, 6, "R200_PP_TXFILTER_1"},
++	{R200_PP_TXFILTER_2, 6, "R200_PP_TXFILTER_2"},
++	{R200_PP_TXFILTER_3, 6, "R200_PP_TXFILTER_3"},
++	{R200_PP_TXFILTER_4, 6, "R200_PP_TXFILTER_4"},
++	{R200_PP_TXFILTER_5, 6, "R200_PP_TXFILTER_5"},
++	{R200_PP_TXOFFSET_0, 1, "R200_PP_TXOFFSET_0"},
++	{R200_PP_TXOFFSET_1, 1, "R200_PP_TXOFFSET_1"},
++	{R200_PP_TXOFFSET_2, 1, "R200_PP_TXOFFSET_2"},
++	{R200_PP_TXOFFSET_3, 1, "R200_PP_TXOFFSET_3"},
++	{R200_PP_TXOFFSET_4, 1, "R200_PP_TXOFFSET_4"},
++	{R200_PP_TXOFFSET_5, 1, "R200_PP_TXOFFSET_5"},
++	{R200_SE_VTE_CNTL, 1, "R200_SE_VTE_CNTL"},
++	{R200_SE_TCL_OUTPUT_VTX_COMP_SEL, 1,
++	 "R200_SE_TCL_OUTPUT_VTX_COMP_SEL"},
++	{R200_PP_TAM_DEBUG3, 1, "R200_PP_TAM_DEBUG3"},
++	{R200_PP_CNTL_X, 1, "R200_PP_CNTL_X"},
++	{R200_RB3D_DEPTHXY_OFFSET, 1, "R200_RB3D_DEPTHXY_OFFSET"},
++	{R200_RE_AUX_SCISSOR_CNTL, 1, "R200_RE_AUX_SCISSOR_CNTL"},
++	{R200_RE_SCISSOR_TL_0, 2, "R200_RE_SCISSOR_TL_0"},
++	{R200_RE_SCISSOR_TL_1, 2, "R200_RE_SCISSOR_TL_1"},
++	{R200_RE_SCISSOR_TL_2, 2, "R200_RE_SCISSOR_TL_2"},
++	{R200_SE_VAP_CNTL_STATUS, 1, "R200_SE_VAP_CNTL_STATUS"},
++	{R200_SE_VTX_STATE_CNTL, 1, "R200_SE_VTX_STATE_CNTL"},
++	{R200_RE_POINTSIZE, 1, "R200_RE_POINTSIZE"},
++	{R200_SE_TCL_INPUT_VTX_VECTOR_ADDR_0, 4,
++		    "R200_SE_TCL_INPUT_VTX_VECTOR_ADDR_0"},
++	{R200_PP_CUBIC_FACES_0, 1, "R200_PP_CUBIC_FACES_0"},	/* 61 */
++	{R200_PP_CUBIC_OFFSET_F1_0, 5, "R200_PP_CUBIC_OFFSET_F1_0"}, /* 62 */
++	{R200_PP_CUBIC_FACES_1, 1, "R200_PP_CUBIC_FACES_1"},
++	{R200_PP_CUBIC_OFFSET_F1_1, 5, "R200_PP_CUBIC_OFFSET_F1_1"},
++	{R200_PP_CUBIC_FACES_2, 1, "R200_PP_CUBIC_FACES_2"},
++	{R200_PP_CUBIC_OFFSET_F1_2, 5, "R200_PP_CUBIC_OFFSET_F1_2"},
++	{R200_PP_CUBIC_FACES_3, 1, "R200_PP_CUBIC_FACES_3"},
++	{R200_PP_CUBIC_OFFSET_F1_3, 5, "R200_PP_CUBIC_OFFSET_F1_3"},
++	{R200_PP_CUBIC_FACES_4, 1, "R200_PP_CUBIC_FACES_4"},
++	{R200_PP_CUBIC_OFFSET_F1_4, 5, "R200_PP_CUBIC_OFFSET_F1_4"},
++	{R200_PP_CUBIC_FACES_5, 1, "R200_PP_CUBIC_FACES_5"},
++	{R200_PP_CUBIC_OFFSET_F1_5, 5, "R200_PP_CUBIC_OFFSET_F1_5"},
++	{RADEON_PP_TEX_SIZE_0, 2, "RADEON_PP_TEX_SIZE_0"},
++	{RADEON_PP_TEX_SIZE_1, 2, "RADEON_PP_TEX_SIZE_1"},
++	{RADEON_PP_TEX_SIZE_2, 2, "RADEON_PP_TEX_SIZE_2"},
++	{R200_RB3D_BLENDCOLOR, 3, "R200_RB3D_BLENDCOLOR"},
++	{R200_SE_TCL_POINT_SPRITE_CNTL, 1, "R200_SE_TCL_POINT_SPRITE_CNTL"},
++	{RADEON_PP_CUBIC_FACES_0, 1, "RADEON_PP_CUBIC_FACES_0"},
++	{RADEON_PP_CUBIC_OFFSET_T0_0, 5, "RADEON_PP_CUBIC_OFFSET_T0_0"},
++	{RADEON_PP_CUBIC_FACES_1, 1, "RADEON_PP_CUBIC_FACES_1"},
++	{RADEON_PP_CUBIC_OFFSET_T1_0, 5, "RADEON_PP_CUBIC_OFFSET_T1_0"},
++	{RADEON_PP_CUBIC_FACES_2, 1, "RADEON_PP_CUBIC_FACES_2"},
++	{RADEON_PP_CUBIC_OFFSET_T2_0, 5, "RADEON_PP_CUBIC_OFFSET_T2_0"},
++	{R200_PP_TRI_PERF, 2, "R200_PP_TRI_PERF"},
++	{R200_PP_TXCBLEND_8, 32, "R200_PP_AFS_0"},     /* 85 */
++	{R200_PP_TXCBLEND_0, 32, "R200_PP_AFS_1"},
++	{R200_PP_TFACTOR_0, 8, "R200_ATF_TFACTOR"},
++	{R200_PP_TXFILTER_0, 8, "R200_PP_TXCTLALL_0"},
++	{R200_PP_TXFILTER_1, 8, "R200_PP_TXCTLALL_1"},
++	{R200_PP_TXFILTER_2, 8, "R200_PP_TXCTLALL_2"},
++	{R200_PP_TXFILTER_3, 8, "R200_PP_TXCTLALL_3"},
++	{R200_PP_TXFILTER_4, 8, "R200_PP_TXCTLALL_4"},
++	{R200_PP_TXFILTER_5, 8, "R200_PP_TXCTLALL_5"},
++	{R200_VAP_PVS_CNTL_1, 2, "R200_VAP_PVS_CNTL"},
++};
 +
-+static inline void _radeon_bo_ref(struct radeon_bo *bo,
-+                                  const char *file,
-+                                  const char *func,
-+                                  int line)
+ /* =============================================================
+  * State initialization
+  */
+ 
+ void r200PrintDirty( r200ContextPtr rmesa, const char *msg )
+ {
+-   struct r200_state_atom *l;
++   struct radeon_state_atom *l; /* NOTE(review): the unchanged fprintf below uses msg as the format string -- callers must not pass '%' in msg */
+ 
+    fprintf(stderr, msg);
+    fprintf(stderr, ": ");
+ 
+-   foreach(l, &rmesa->hw.atomlist) {
+-      if (l->dirty || rmesa->hw.all_dirty)
++   foreach(l, &rmesa->radeon.hw.atomlist) { /* walk the atom list kept in the embedded radeon context */
++      if (l->dirty || rmesa->radeon.hw.all_dirty) /* print an atom if it is dirty or everything is marked dirty */
+ 	 fprintf(stderr, "%s, ", l->name);
+    }
+ 
+    fprintf(stderr, "\n");
+ }
+ 
+-static int cmdpkt( int id ) 
++static int cmdpkt( r200ContextPtr rmesa, int id ) 
+ {
+    drm_radeon_cmd_header_t h;
+-   h.i = 0;
+-   h.packet.cmd_type = RADEON_CMD_PACKET;
+-   h.packet.packet_id = id;
++
++   if (rmesa->radeon.radeonScreen->kernel_mm) { /* kernel_mm set: build the header with CP_PACKET0 from packet[id] */
++     return CP_PACKET0(packet[id].start, packet[id].len - 1); /* NOTE(review): id is not range-checked against the packet table */
++   } else { /* otherwise: drm_radeon_cmd_header_t of type RADEON_CMD_PACKET carrying the id */
++     h.i = 0;
++     h.packet.cmd_type = RADEON_CMD_PACKET;
++     h.packet.packet_id = id;
++   }
+    return h.i;
+ }
+ 
+@@ -127,96 +243,388 @@ static int cmdscl2( int offset, int stride, int count )
+ }
+ 
+ #define CHECK( NM, FLAG )				\
+-static GLboolean check_##NM( GLcontext *ctx, int idx )	\
++static int check_##NM( GLcontext *ctx, struct radeon_state_atom *atom) /* defines check_NM(ctx, atom) */ \
+ {							\
+    r200ContextPtr rmesa = R200_CONTEXT(ctx);		\
+-   (void) idx;						\
+    (void) rmesa;					\
+-   return FLAG;						\
++   return (FLAG) ? atom->cmd_size : 0;	/* atom->cmd_size when FLAG holds, else 0 */	\
+ }
+ 
+ #define TCL_CHECK( NM, FLAG )				\
+-static GLboolean check_##NM( GLcontext *ctx, int idx )	\
+-{							\
+-   r200ContextPtr rmesa = R200_CONTEXT(ctx);		\
+-   (void) idx;						\
+-   return !rmesa->TclFallback && !ctx->VertexProgram._Enabled && (FLAG);	\
++static int check_##NM( GLcontext *ctx, struct radeon_state_atom *atom) /* defines check_NM(ctx, atom) */ \
++{									\
++   r200ContextPtr rmesa = R200_CONTEXT(ctx);				\
++   return (!rmesa->radeon.TclFallback && !ctx->VertexProgram._Enabled && (FLAG)) ? atom->cmd_size : 0; /* needs: no TCL fallback, vertex programs disabled, FLAG */ \
+ }
+ 
+ #define TCL_OR_VP_CHECK( NM, FLAG )			\
+-static GLboolean check_##NM( GLcontext *ctx, int idx )	\
++static int check_##NM( GLcontext *ctx, struct radeon_state_atom *atom ) /* defines check_NM(ctx, atom) */ \
+ {							\
+    r200ContextPtr rmesa = R200_CONTEXT(ctx);		\
+-   (void) idx;						\
+-   return !rmesa->TclFallback && (FLAG);		\
++   return (!rmesa->radeon.TclFallback && (FLAG)) ? atom->cmd_size : 0;	/* needs: no TCL fallback and FLAG; vertex program state is not consulted */ \
+ }
+ 
+ #define VP_CHECK( NM, FLAG )				\
+-static GLboolean check_##NM( GLcontext *ctx, int idx )	\
+-{							\
+-   r200ContextPtr rmesa = R200_CONTEXT(ctx);		\
+-   (void) idx;						\
+-   return !rmesa->TclFallback && ctx->VertexProgram._Enabled && (FLAG);		\
++static int check_##NM( GLcontext *ctx, struct radeon_state_atom *atom ) /* defines check_NM(ctx, atom) */ \
++{									\
++   r200ContextPtr rmesa = R200_CONTEXT(ctx);				\
++   (void) atom;	/* redundant: atom is used on the next line */		\
++   return (!rmesa->radeon.TclFallback && ctx->VertexProgram._Enabled && (FLAG)) ? atom->cmd_size : 0; /* needs: no TCL fallback, vertex programs enabled, FLAG */ \
+ }
+ 
+-
+ CHECK( always, GL_TRUE )
+ CHECK( never, GL_FALSE )
+ CHECK( tex_any, ctx->Texture._EnabledUnits )
+ CHECK( tf, (ctx->Texture._EnabledUnits && !ctx->ATIFragmentShader._Enabled) );
+-CHECK( tex_pair, (rmesa->state.texture.unit[idx].unitneeded | rmesa->state.texture.unit[idx & ~1].unitneeded) )
+-CHECK( tex, rmesa->state.texture.unit[idx].unitneeded )
++CHECK( tex_pair, (rmesa->state.texture.unit[atom->idx].unitneeded | rmesa->state.texture.unit[atom->idx & ~1].unitneeded) ) /* unit atom->idx or unit (atom->idx & ~1) is needed */
++CHECK( tex, rmesa->state.texture.unit[atom->idx].unitneeded ) /* the unit index is read from atom->idx */
+ CHECK( pix_zero, !ctx->ATIFragmentShader._Enabled )
+-CHECK( texenv, (rmesa->state.envneeded & (1 << idx) && !ctx->ATIFragmentShader._Enabled) )
++   CHECK( texenv, (rmesa->state.envneeded & (1 << (atom->idx)) && !ctx->ATIFragmentShader._Enabled) ) /* envneeded bit atom->idx set and no ATI fragment shader */
+ CHECK( afs_pass1, (ctx->ATIFragmentShader._Enabled && (ctx->ATIFragmentShader.Current->NumPasses > 1)) )
+ CHECK( afs, ctx->ATIFragmentShader._Enabled )
+-CHECK( tex_cube, rmesa->state.texture.unit[idx].unitneeded & TEXTURE_CUBE_BIT )
++CHECK( tex_cube, rmesa->state.texture.unit[atom->idx].unitneeded & TEXTURE_CUBE_BIT )
+ TCL_CHECK( tcl_fog, ctx->Fog.Enabled )
+ TCL_CHECK( tcl, GL_TRUE )
+-TCL_CHECK( tcl_tex, rmesa->state.texture.unit[idx].unitneeded )
++TCL_CHECK( tcl_tex, rmesa->state.texture.unit[atom->idx].unitneeded )
+ TCL_CHECK( tcl_lighting, ctx->Light.Enabled )
+-TCL_CHECK( tcl_light, ctx->Light.Enabled && ctx->Light.Light[idx].Enabled )
+-TCL_OR_VP_CHECK( tcl_ucp, (ctx->Transform.ClipPlanesEnabled & (1 << idx)) )
++TCL_CHECK( tcl_light, ctx->Light.Enabled && ctx->Light.Light[atom->idx].Enabled ) /* atom->idx selects the light */
++TCL_OR_VP_CHECK( tcl_ucp, (ctx->Transform.ClipPlanesEnabled & (1 << (atom->idx))) ) /* atom->idx selects the clip-plane bit */
+ TCL_OR_VP_CHECK( tcl_or_vp, GL_TRUE )
+ VP_CHECK( tcl_vp, GL_TRUE )
+ VP_CHECK( tcl_vp_size, ctx->VertexProgram.Current->Base.NumNativeInstructions > 64 )
+ VP_CHECK( tcl_vpp_size, ctx->VertexProgram.Current->Base.NumNativeParameters > 96 )
+ 
++#define OUT_VEC(hdr, data) do {	/* five OUT_BATCH words, then h.vectors.count entries from data */	\
++    drm_radeon_cmd_header_t h;					\
++    h.i = hdr;	/* decode hdr through the vectors view of the header */	\
++    OUT_BATCH(CP_PACKET0(RADEON_SE_TCL_STATE_FLUSH, 0));		\
++    OUT_BATCH(0);							\
++    OUT_BATCH(CP_PACKET0(R200_SE_TCL_VECTOR_INDX_REG, 0));		\
++    OUT_BATCH(h.vectors.offset | (h.vectors.stride << RADEON_VEC_INDX_OCTWORD_STRIDE_SHIFT)); \
++    OUT_BATCH(CP_PACKET0_ONE(R200_SE_TCL_VECTOR_DATA_REG, h.vectors.count - 1));	\
++    OUT_BATCH_TABLE((data), h.vectors.count);				\
++  } while(0)
++
++#define OUT_VECLINEAR(hdr, data) do {	/* five OUT_BATCH words, then count * 4 entries from data */	\
++    drm_radeon_cmd_header_t h;					\
++    h.i = hdr;	/* must come first: _start and _sz are decoded from h */	\
++    uint32_t _start = h.veclinear.addr_lo | (h.veclinear.addr_hi << 8);	\
++    uint32_t _sz = h.veclinear.count * 4;				\
++    OUT_BATCH(CP_PACKET0(RADEON_SE_TCL_STATE_FLUSH, 0));		\
++    OUT_BATCH(0);							\
++    OUT_BATCH(CP_PACKET0(R200_SE_TCL_VECTOR_INDX_REG, 0));		\
++    OUT_BATCH(_start | (1 << RADEON_VEC_INDX_OCTWORD_STRIDE_SHIFT));	\
++    OUT_BATCH(CP_PACKET0_ONE(R200_SE_TCL_VECTOR_DATA_REG, _sz - 1));	\
++    OUT_BATCH_TABLE((data), _sz);					\
++  } while(0)
++
++#define OUT_SCL(hdr, data) do {	/* three OUT_BATCH words, then h.scalars.count entries from data */	\
++    drm_radeon_cmd_header_t h;						\
++    h.i = hdr;	/* decode hdr through the scalars view of the header */	\
++    OUT_BATCH(CP_PACKET0(R200_SE_TCL_SCALAR_INDX_REG, 0));		\
++    OUT_BATCH((h.scalars.offset) | (h.scalars.stride << RADEON_SCAL_INDX_DWORD_STRIDE_SHIFT)); \
++    OUT_BATCH(CP_PACKET0_ONE(R200_SE_TCL_SCALAR_DATA_REG, h.scalars.count - 1));	\
++    OUT_BATCH_TABLE((data), h.scalars.count);				\
++  } while(0)
++
++#define OUT_SCL2(hdr, data) do {	/* same sequence as OUT_SCL, with 0x100 added to the scalar offset */	\
++    drm_radeon_cmd_header_t h;						\
++    h.i = hdr;								\
++    OUT_BATCH(CP_PACKET0(R200_SE_TCL_SCALAR_INDX_REG, 0));		\
++    OUT_BATCH((h.scalars.offset + 0x100) | (h.scalars.stride << RADEON_SCAL_INDX_DWORD_STRIDE_SHIFT)); \
++    OUT_BATCH(CP_PACKET0_ONE(R200_SE_TCL_SCALAR_DATA_REG, h.scalars.count - 1));	\
++    OUT_BATCH_TABLE((data), h.scalars.count);				\
++  } while(0)
++
++static void mtl_emit(GLcontext *ctx, struct radeon_state_atom *atom) /* emits atom as one OUT_VEC followed by one OUT_SCL2 */
 +{
-+    bo->cref++;
-+#ifdef RADEON_BO_TRACK
-+    radeon_track_add_event(bo->track, file, func, "ref", line); 
-+#endif
-+    bo->bom->funcs->bo_ref(bo);
++   r200ContextPtr r200 = R200_CONTEXT(ctx);
++   BATCH_LOCALS(&r200->radeon);
++   uint32_t dwords = atom->cmd_size;
++
++   dwords += 6; /* 5 OUT_VEC + 3 OUT_SCL2 packet words replace the 2 cmd header words -- assumes cmd_size counts both headers */
++   BEGIN_BATCH_NO_AUTOSTATE(dwords);
++   OUT_VEC(atom->cmd[MTL_CMD_0], (atom->cmd+1));
++   OUT_SCL2(atom->cmd[MTL_CMD_1], (atom->cmd + 18)); /* NOTE(review): 18 is presumably MTL_CMD_1 + 1 -- confirm */
++   END_BATCH();
 +}
 +
-+static inline struct radeon_bo *_radeon_bo_unref(struct radeon_bo *bo,
-+                                                 const char *file,
-+                                                 const char *func,
-+                                                 int line)
++static void lit_emit(GLcontext *ctx, struct radeon_state_atom *atom) /* emits atom as two OUT_VEC sequences */
 +{
-+    bo->cref--;
-+#ifdef RADEON_BO_TRACK
-+    radeon_track_add_event(bo->track, file, func, "unref", line);
-+    if (bo->cref <= 0) {
-+        radeon_tracker_remove_track(&bo->bom->tracker, bo->track);
-+        bo->track = NULL;
-+    }
-+#endif
-+    return bo->bom->funcs->bo_unref(bo);
++   r200ContextPtr r200 = R200_CONTEXT(ctx);
++   BATCH_LOCALS(&r200->radeon);
++   uint32_t dwords = atom->cmd_size;
++
++   dwords += 8; /* 2 * 5 OUT_VEC packet words replace the 2 cmd header words -- assumes cmd_size counts both headers */
++   BEGIN_BATCH_NO_AUTOSTATE(dwords);
++   OUT_VEC(atom->cmd[LIT_CMD_0], atom->cmd+1);
++   OUT_VEC(atom->cmd[LIT_CMD_1], atom->cmd+LIT_CMD_1+1); /* data for the second vector starts right after its header word */
++   END_BATCH();
 +}
 +
-+static inline int _radeon_bo_map(struct radeon_bo *bo,
-+                                 int write,
-+                                 const char *file,
-+                                 const char *func,
-+                                 int line)
++static void ptp_emit(GLcontext *ctx, struct radeon_state_atom *atom) /* emits atom as two OUT_VEC sequences */
 +{
-+    return bo->bom->funcs->bo_map(bo, write);
++   r200ContextPtr r200 = R200_CONTEXT(ctx);
++   BATCH_LOCALS(&r200->radeon);
++   uint32_t dwords = atom->cmd_size;
++
++   dwords += 8; /* 2 * 5 OUT_VEC packet words replace the 2 cmd header words -- assumes cmd_size counts both headers */
++   BEGIN_BATCH_NO_AUTOSTATE(dwords);
++   OUT_VEC(atom->cmd[PTP_CMD_0], atom->cmd+1);
++   OUT_VEC(atom->cmd[PTP_CMD_1], atom->cmd+PTP_CMD_1+1); /* data for the second vector starts right after its header word */
++   END_BATCH();
 +}
 +
-+static inline int _radeon_bo_unmap(struct radeon_bo *bo,
-+                                   const char *file,
-+                                   const char *func,
-+                                   int line)
++static void veclinear_emit(GLcontext *ctx, struct radeon_state_atom *atom) /* emits atom as a single OUT_VECLINEAR sequence */
 +{
-+    return bo->bom->funcs->bo_unmap(bo);
++   r200ContextPtr r200 = R200_CONTEXT(ctx);
++   BATCH_LOCALS(&r200->radeon);
++   uint32_t dwords = atom->cmd_size;
++
++   dwords += 4; /* 5 OUT_VECLINEAR packet words replace the single cmd header word -- assumes cmd_size counts that header */
++   BEGIN_BATCH_NO_AUTOSTATE(dwords);
++   OUT_VECLINEAR(atom->cmd[0], atom->cmd+1);
++   END_BATCH();
 +}
 +
-+static inline int _radeon_bo_wait(struct radeon_bo *bo,
-+                                  const char *file,
-+                                  const char *func,
-+                                  int line)
++static void scl_emit(GLcontext *ctx, struct radeon_state_atom *atom)
 +{
-+    return bo->bom->funcs->bo_wait(bo);
++   r200ContextPtr r200 = R200_CONTEXT(ctx);
++   BATCH_LOCALS(&r200->radeon);
++   /* one OUT_SCL upload, 2 dwords reserved beyond the atom itself */
++   uint32_t dwords = atom->cmd_size + 2;
++
++   BEGIN_BATCH_NO_AUTOSTATE(dwords);
++   OUT_SCL(atom->cmd[0], atom->cmd+1);
++   END_BATCH();
 +}
 +
-+#define radeon_bo_open(bom, h, s, a, d, f)\
-+    _radeon_bo_open(bom, h, s, a, d, f, __FILE__, __FUNCTION__, __LINE__)
-+#define radeon_bo_ref(bo)\
-+    _radeon_bo_ref(bo, __FILE__, __FUNCTION__, __LINE__)
-+#define radeon_bo_unref(bo)\
-+    _radeon_bo_unref(bo, __FILE__, __FUNCTION__, __LINE__)
-+#define radeon_bo_map(bo, w)\
-+    _radeon_bo_map(bo, w, __FILE__, __FUNCTION__, __LINE__)
-+#define radeon_bo_unmap(bo)\
-+    _radeon_bo_unmap(bo, __FILE__, __FUNCTION__, __LINE__)
-+#define radeon_bo_debug(bo, opcode)\
-+    _radeon_bo_debug(bo, opcode, __FILE__, __FUNCTION__, __LINE__)
-+#define radeon_bo_wait(bo) \
-+    _radeon_bo_wait(bo, __FILE__, __func__, __LINE__)
 +
-+#endif
-diff --git a/src/mesa/drivers/dri/radeon/radeon_bo_legacy.c b/src/mesa/drivers/dri/radeon/radeon_bo_legacy.c
-new file mode 100644
-index 0000000..03a6299
---- /dev/null
-+++ b/src/mesa/drivers/dri/radeon/radeon_bo_legacy.c
-@@ -0,0 +1,825 @@
-+/* 
-+ * Copyright © 2008 Nicolai Haehnle
-+ * Copyright © 2008 Dave Airlie
-+ * Copyright © 2008 Jérôme Glisse
-+ * All Rights Reserved.
-+ * 
-+ * Permission is hereby granted, free of charge, to any person obtaining a
-+ * copy of this software and associated documentation files (the
-+ * "Software"), to deal in the Software without restriction, including
-+ * without limitation the rights to use, copy, modify, merge, publish,
-+ * distribute, sub license, and/or sell copies of the Software, and to
-+ * permit persons to whom the Software is furnished to do so, subject to
-+ * the following conditions:
-+ * 
-+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
-+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
-+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
-+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
-+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
-+ *
-+ * The above copyright notice and this permission notice (including the
-+ * next paragraph) shall be included in all copies or substantial portions
-+ * of the Software.
-+ */
-+/*
-+ * Authors:
-+ *      Aapo Tahkola <aet@rasterburn.org>
-+ *      Nicolai Haehnle <prefect_@gmx.net>
-+ *      Dave Airlie
-+ *      Jérôme Glisse <glisse@freedesktop.org>
-+ */
-+#include <stdio.h>
-+#include <stddef.h>
-+#include <stdint.h>
-+#include <stdlib.h>
-+#include <string.h>
-+#include <errno.h>
-+#include <unistd.h>
-+#include <sys/mman.h>
-+#include <sys/ioctl.h>
-+#include "xf86drm.h"
-+#include "texmem.h"
-+#include "main/simple_list.h"
++static void vec_emit(GLcontext *ctx, struct radeon_state_atom *atom)
++{
++   r200ContextPtr r200 = R200_CONTEXT(ctx);
++   BATCH_LOCALS(&r200->radeon);
++   /* one OUT_VEC upload, 4 dwords reserved beyond the atom itself */
++   uint32_t dwords = atom->cmd_size + 4;
++
++   BEGIN_BATCH_NO_AUTOSTATE(dwords);
++   OUT_VEC(atom->cmd[0], atom->cmd+1);
++   END_BATCH();
++}
 +
-+#include "drm.h"
-+#include "radeon_drm.h"
-+#include "radeon_common.h"
-+#include "radeon_bocs_wrapper.h"
++static void ctx_emit(GLcontext *ctx, struct radeon_state_atom *atom)
++{
++   r200ContextPtr r200 = R200_CONTEXT(ctx);
++   BATCH_LOCALS(&r200->radeon);
++   struct radeon_renderbuffer *rrb;
++   uint32_t cbpitch;
++   uint32_t zbpitch, depth_fmt;
++   uint32_t dwords = atom->cmd_size;
++   /* Emits the CTX atom, replacing the cached depth/colour offsets and pitches with values taken from the bound renderbuffers. */
++   /* Reserve cmd_size + 4 dwords and hand the first 5 atom entries to OUT_BATCH_TABLE (the old "7 bytes" wording did not match the code). */
++   BEGIN_BATCH_NO_AUTOSTATE(dwords+2+2);
++   OUT_BATCH_TABLE(atom->cmd, 5);
++   /* Depth buffer. NOTE(review): rrb->bo is used without the !rrb->bo guard the colour path applies below - confirm it cannot be NULL. */
++   rrb = radeon_get_depthbuffer(&r200->radeon);
++   if (!rrb) {
++     OUT_BATCH(0);
++     OUT_BATCH(0);
++   } else {
++     zbpitch = (rrb->pitch / rrb->cpp);
++     if (r200->using_hyperz)
++       zbpitch |= RADEON_DEPTH_HYPERZ;
++     OUT_BATCH_RELOC(0, rrb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
++     OUT_BATCH(zbpitch);
++     if (rrb->cpp == 4) 
++       depth_fmt = RADEON_DEPTH_FORMAT_24BIT_INT_Z; 
++     else 
++       depth_fmt = RADEON_DEPTH_FORMAT_16BIT_INT_Z; 
++     atom->cmd[CTX_RB3D_ZSTENCILCNTL] &= ~RADEON_DEPTH_FORMAT_MASK; 
++     atom->cmd[CTX_RB3D_ZSTENCILCNTL] |= depth_fmt; 
++   }
++     
++   OUT_BATCH(atom->cmd[CTX_RB3D_ZSTENCILCNTL]);
++   OUT_BATCH(atom->cmd[CTX_CMD_1]);
++   OUT_BATCH(atom->cmd[CTX_PP_CNTL]);
++   /* Colour buffer: without a buffer object the cached RB3D_CNTL / COLOROFFSET values are emitted unchanged. */
++   rrb = radeon_get_colorbuffer(&r200->radeon);
++   if (!rrb || !rrb->bo) {
++     OUT_BATCH(atom->cmd[CTX_RB3D_CNTL]);
++     OUT_BATCH(atom->cmd[CTX_RB3D_COLOROFFSET]);
++   } else {
++     atom->cmd[CTX_RB3D_CNTL] &= ~(0xf << 10); 
++     if (rrb->cpp == 4) 
++       atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB8888; 
++     else 
++       atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_RGB565; 
++ 
++     OUT_BATCH(atom->cmd[CTX_RB3D_CNTL]); 
++     OUT_BATCH_RELOC(0, rrb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
++   }
 +
-+/* no seriously texmem.c is this screwed up */
-+struct bo_legacy_texture_object {
-+    driTextureObject    base;
-+    struct bo_legacy *parent;
-+};
++   OUT_BATCH(atom->cmd[CTX_CMD_2]);
 +
-+struct bo_legacy {
-+    struct radeon_bo    base;
-+    int                 map_count;
-+    uint32_t            pending;
-+    int                 is_pending;
-+    int                 static_bo;
-+    uint32_t            offset;
-+    struct bo_legacy_texture_object *tobj;
-+    int                 validated;
-+    int                 dirty;
-+    void                *ptr;
-+    struct bo_legacy    *next, *prev;
-+    struct bo_legacy    *pnext, *pprev;
-+};
++   if (!rrb || !rrb->bo) {
++     OUT_BATCH(atom->cmd[CTX_RB3D_COLORPITCH]);
++   } else {
++     cbpitch = (rrb->pitch / rrb->cpp);
++     if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE)
++       cbpitch |= R200_COLOR_TILE_ENABLE;
++     OUT_BATCH(cbpitch);
++   }
 +
-+struct bo_manager_legacy {
-+    struct radeon_bo_manager    base;
-+    unsigned                    nhandle;
-+    unsigned                    nfree_handles;
-+    unsigned                    cfree_handles;
-+    uint32_t                    current_age;
-+    struct bo_legacy            bos;
-+    struct bo_legacy            pending_bos;
-+    uint32_t                    fb_location;
-+    uint32_t                    texture_offset;
-+    unsigned                    dma_alloc_size;
-+    uint32_t                    dma_buf_count;
-+    unsigned                    cpendings;
-+    driTextureObject            texture_swapped;
-+    driTexHeap                  *texture_heap;
-+    struct radeon_screen        *screen;
-+    unsigned                    *free_handles;
-+};
++   if (atom->cmd_size == CTX_STATE_SIZE_NEWDRM) /* the larger atom carries 4 more entries starting at index 14 */
++     OUT_BATCH_TABLE((atom->cmd + 14), 4);
 +
-+static void bo_legacy_tobj_destroy(void *data, driTextureObject *t)
-+{
-+    struct bo_legacy_texture_object *tobj = (struct bo_legacy_texture_object *)t;
-+    
-+    if (tobj->parent) {
-+        tobj->parent->tobj = NULL;
-+        tobj->parent->validated = 0;
-+    }
++   END_BATCH();
 +}
 +
-+static void inline clean_handles(struct bo_manager_legacy *bom)
++static void ctx_emit_cs(GLcontext *ctx, struct radeon_state_atom *atom)
 +{
-+  while (bom->cfree_handles > 0 &&
-+	 !bom->free_handles[bom->cfree_handles - 1])
-+    bom->cfree_handles--;
++   r200ContextPtr r200 = R200_CONTEXT(ctx);
++   BATCH_LOCALS(&r200->radeon);
++   struct radeon_renderbuffer *rrb, *drb;
++   uint32_t cbpitch = 0;
++   uint32_t zbpitch = 0;
++   uint32_t dwords = atom->cmd_size;
++   uint32_t depth_fmt;
++   /* CS variant of the CTX emit: nothing is emitted unless a colour buffer object is bound. */
++   rrb = radeon_get_colorbuffer(&r200->radeon);
++   if (!rrb || !rrb->bo) {
++      return;
++   }
 +
-+}
-+static int legacy_new_handle(struct bo_manager_legacy *bom, uint32_t *handle)
-+{
-+    uint32_t tmp;
++   atom->cmd[CTX_RB3D_CNTL] &= ~(0xf << 10);
++   if (rrb->cpp == 4)
++	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB8888;
++   else
++	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_RGB565;
++   /* colour pitch is pitch / cpp, flagged when the buffer object is macro-tiled */
++   cbpitch = (rrb->pitch / rrb->cpp);
++   if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE)
++       cbpitch |= R200_COLOR_TILE_ENABLE;
++   /* the depth buffer is optional; when present its format bits replace those cached in ZSTENCILCNTL */
++   drb = radeon_get_depthbuffer(&r200->radeon);
++   if (drb) {
++     zbpitch = (drb->pitch / drb->cpp);
++     if (drb->cpp == 4)
++        depth_fmt = RADEON_DEPTH_FORMAT_24BIT_INT_Z;
++     else
++        depth_fmt = RADEON_DEPTH_FORMAT_16BIT_INT_Z;
++     atom->cmd[CTX_RB3D_ZSTENCILCNTL] &= ~RADEON_DEPTH_FORMAT_MASK;
++     atom->cmd[CTX_RB3D_ZSTENCILCNTL] |= depth_fmt;
++   }
 +
-+    *handle = 0;
-+    if (bom->nhandle == 0xFFFFFFFF) {
-+        return -EINVAL;
-+    }
-+    if (bom->cfree_handles > 0) {
-+        tmp = bom->free_handles[--bom->cfree_handles];
-+	clean_handles(bom);
-+    } else {
-+        bom->cfree_handles = 0;
-+        tmp = bom->nhandle++;
-+    }
-+    assert(tmp);
-+    *handle = tmp;
-+    return 0;
-+}
++   if (drb)
++     dwords += 4;
++   if (rrb)
++     dwords += 4;
 +
-+static int legacy_free_handle(struct bo_manager_legacy *bom, uint32_t handle)
-+{
-+    uint32_t *handles;
++   /* dwords = atom size plus 4 for each bound buffer, as added just above */
++   BEGIN_BATCH_NO_AUTOSTATE(dwords);
 +
-+    if (!handle) {
-+        return 0;
-+    }
-+    if (handle == (bom->nhandle - 1)) {
-+        int i;
++   /* In the CS case we need to split this up */
++   OUT_BATCH(CP_PACKET0(packet[0].start, 3));
++   OUT_BATCH_TABLE((atom->cmd + 1), 4);
 +
-+        bom->nhandle--;
-+        for (i = bom->cfree_handles - 1; i >= 0; i--) {
-+            if (bom->free_handles[i] == (bom->nhandle - 1)) {
-+                bom->nhandle--;
-+                bom->free_handles[i] = 0;
-+            }
-+        }
-+        clean_handles(bom);
-+        return 0;
-+    }
-+    if (bom->cfree_handles < bom->nfree_handles) {
-+        bom->free_handles[bom->cfree_handles++] = handle;
-+        return 0;
-+    }
-+    bom->nfree_handles += 0x100;
-+    handles = (uint32_t*)realloc(bom->free_handles, bom->nfree_handles * 4);
-+    if (handles == NULL) {
-+        bom->nfree_handles -= 0x100;
-+        return -ENOMEM;
-+    }
-+    bom->free_handles = handles;
-+    bom->free_handles[bom->cfree_handles++] = handle;
-+    return 0;
-+}
++   if (drb) {
++     OUT_BATCH(CP_PACKET0(RADEON_RB3D_DEPTHOFFSET, 0));
++     OUT_BATCH_RELOC(0, drb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0); /* depth BO, not the colour BO */
 +
-+static void legacy_get_current_age(struct bo_manager_legacy *boml)
-+{
-+    drm_radeon_getparam_t gp;
-+    int r;
++     OUT_BATCH(CP_PACKET0(RADEON_RB3D_DEPTHPITCH, 0));
++     OUT_BATCH(zbpitch);
++   }
 +
-+    if (IS_R300_CLASS(boml->screen)) {
-+    	gp.param = RADEON_PARAM_LAST_CLEAR;
-+    	gp.value = (int *)&boml->current_age;
-+    	r = drmCommandWriteRead(boml->base.fd, DRM_RADEON_GETPARAM,
-+       	                     &gp, sizeof(gp));
-+    	if (r) {
-+       	 fprintf(stderr, "%s: drmRadeonGetParam: %d\n", __FUNCTION__, r);
-+         exit(1);
-+       }
-+    } else
-+	boml->current_age = boml->screen->scratch[3];
-+}
++   OUT_BATCH(CP_PACKET0(RADEON_RB3D_ZSTENCILCNTL, 0));
++   OUT_BATCH(atom->cmd[CTX_RB3D_ZSTENCILCNTL]);
++   OUT_BATCH(CP_PACKET0(RADEON_PP_CNTL, 1));
++   OUT_BATCH(atom->cmd[CTX_PP_CNTL]);
++   OUT_BATCH(atom->cmd[CTX_RB3D_CNTL]);
 +
-+static int legacy_is_pending(struct radeon_bo *bo)
-+{
-+    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bo->bom;
-+    struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
 +
-+    if (bo_legacy->is_pending <= 0) {
-+        bo_legacy->is_pending = 0;
-+        return 0;
-+    }
-+    if (boml->current_age >= bo_legacy->pending) {
-+        if (boml->pending_bos.pprev == bo_legacy) {
-+            boml->pending_bos.pprev = bo_legacy->pprev;
-+        }
-+        bo_legacy->pprev->pnext = bo_legacy->pnext;
-+        if (bo_legacy->pnext) {
-+            bo_legacy->pnext->pprev = bo_legacy->pprev;
-+        }
-+	assert(bo_legacy->is_pending <= bo->cref);
-+        while (bo_legacy->is_pending--) {
-+	    bo = radeon_bo_unref(bo);
-+	    if (!bo)
-+	      break;
-+        }
-+	if (bo)
-+	  bo_legacy->is_pending = 0;
-+        boml->cpendings--;
-+        return 0;
-+    }
-+    return 1;
-+}
++   if (rrb) {
++     OUT_BATCH(CP_PACKET0(RADEON_RB3D_COLOROFFSET, 0));
++     OUT_BATCH_RELOC(0, rrb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
++   }
 +
-+static int legacy_wait_pending(struct radeon_bo *bo)
-+{
-+    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bo->bom;
-+    struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
++   if (rrb) {
++     OUT_BATCH(CP_PACKET0(RADEON_RB3D_COLORPITCH, 0));
++     OUT_BATCH(cbpitch);
++   }
 +
-+    if (!bo_legacy->is_pending) {
-+        return 0;
-+    }
-+    /* FIXME: lockup and userspace busy looping that's all the folks */
-+    legacy_get_current_age(boml);
-+    while (legacy_is_pending(bo)) {
-+        usleep(10);
-+        legacy_get_current_age(boml);
-+    }
-+    return 0;
++   if (atom->cmd_size == CTX_STATE_SIZE_NEWDRM) {
++     OUT_BATCH_TABLE((atom->cmd + 14), 4);
++   }
++
++   END_BATCH();
 +}
 +
-+static void legacy_track_pending(struct bo_manager_legacy *boml, int debug)
++static void tex_emit(GLcontext *ctx, struct radeon_state_atom *atom)
 +{
-+    struct bo_legacy *bo_legacy;
-+    struct bo_legacy *next;
-+
-+    legacy_get_current_age(boml);
-+    bo_legacy = boml->pending_bos.pnext;
-+    while (bo_legacy) {
-+        if (debug)
-+	  fprintf(stderr,"pending %p %d %d %d\n", bo_legacy, bo_legacy->base.size,
-+		  boml->current_age, bo_legacy->pending);
-+        next = bo_legacy->pnext;
-+        if (legacy_is_pending(&(bo_legacy->base))) {
-+        }
-+        bo_legacy = next;
-+    } 
++   r200ContextPtr r200 = R200_CONTEXT(ctx);
++   BATCH_LOCALS(&r200->radeon);
++   uint32_t dwords = atom->cmd_size;
++   int i = atom->idx;
++   radeonTexObj *t = r200->state.texture.unit[i].texobj;
++   /* The relocation needs t->mt and 2 extra dwords; the same predicate must guard its emission below. */
++   if (t && t->mt && !t->image_override)
++     dwords += 2;
++   BEGIN_BATCH_NO_AUTOSTATE(dwords);
++   OUT_BATCH_TABLE(atom->cmd, 10);
++   if (t && t->mt && !t->image_override) {
++     OUT_BATCH_RELOC(t->tile_bits, t->mt->bo, 0,
++		     RADEON_GEM_DOMAIN_VRAM, 0, 0);
++   } else if (!t) {
++     /* workaround for old CS mechanism */
++     OUT_BATCH(r200->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP]);
++   } else /* image override, or no miptree to relocate against (NOTE(review): confirm override_offset is the wanted fallback) */
++     OUT_BATCH(t->override_offset);
++
++   END_BATCH();
 +}
 +
-+static int legacy_wait_any_pending(struct bo_manager_legacy *boml)
++static void cube_emit(GLcontext *ctx, struct radeon_state_atom *atom)
 +{
-+    struct bo_legacy *bo_legacy;
-+
-+    legacy_get_current_age(boml);
-+    bo_legacy = boml->pending_bos.pnext;
-+    if (!bo_legacy)
-+      return -1;
-+    legacy_wait_pending(&bo_legacy->base);
-+    return 0;
++   r200ContextPtr r200 = R200_CONTEXT(ctx);
++   BATCH_LOCALS(&r200->radeon);
++   uint32_t dwords = atom->cmd_size;
++   int i = atom->idx;
++   radeonTexObj *t = r200->state.texture.unit[i].texobj;
++   GLuint size;
++   /* Emits the first 3 atom entries, then five relocations into the miptree BO at multiples of totalsize / 6. */
++   BEGIN_BATCH_NO_AUTOSTATE(dwords + (2 * 5));
++   OUT_BATCH_TABLE(atom->cmd, 3);
++   /* NOTE(review): t->mt is not NULL-checked, and dwords + 10 is reserved even when this branch is skipped - confirm both. */
++   if (t && !t->image_override) {
++     size = t->mt->totalsize / 6;
++     OUT_BATCH_RELOC(0, t->mt->bo, size, RADEON_GEM_DOMAIN_VRAM, 0, 0);
++     OUT_BATCH_RELOC(0, t->mt->bo, size * 2, RADEON_GEM_DOMAIN_VRAM, 0, 0);
++     OUT_BATCH_RELOC(0, t->mt->bo, size * 3, RADEON_GEM_DOMAIN_VRAM, 0, 0);
++     OUT_BATCH_RELOC(0, t->mt->bo, size * 4, RADEON_GEM_DOMAIN_VRAM, 0, 0);
++     OUT_BATCH_RELOC(0, t->mt->bo, size * 5, RADEON_GEM_DOMAIN_VRAM, 0, 0);
++   }
++   END_BATCH();
 +}
+ 
+ /* Initialize the context's hardware state.
+  */
+ void r200InitState( r200ContextPtr rmesa )
+ {
+-   GLcontext *ctx = rmesa->glCtx;
+-   GLuint color_fmt, depth_fmt, i;
+-   GLint drawPitch, drawOffset;
+-
+-   switch ( rmesa->r200Screen->cpp ) {
+-   case 2:
+-      color_fmt = R200_COLOR_FORMAT_RGB565;
+-      break;
+-   case 4:
+-      color_fmt = R200_COLOR_FORMAT_ARGB8888;
+-      break;
+-   default:
+-      fprintf( stderr, "Error: Unsupported pixel depth... exiting\n" );
+-      exit( -1 );
+-   }
++   GLcontext *ctx = rmesa->radeon.glCtx;
++   GLuint i;
+ 
+-   rmesa->state.color.clear = 0x00000000;
++   rmesa->radeon.state.color.clear = 0x00000000;
+ 
+    switch ( ctx->Visual.depthBits ) {
+    case 16:
+-      rmesa->state.depth.clear = 0x0000ffff;
+-      rmesa->state.depth.scale = 1.0 / (GLfloat)0xffff;
+-      depth_fmt = R200_DEPTH_FORMAT_16BIT_INT_Z;
+-      rmesa->state.stencil.clear = 0x00000000;
++      rmesa->radeon.state.depth.clear = 0x0000ffff;
++      rmesa->radeon.state.depth.scale = 1.0 / (GLfloat)0xffff;
++      rmesa->radeon.state.stencil.clear = 0x00000000;
+       break;
+    case 24:
+-      rmesa->state.depth.clear = 0x00ffffff;
+-      rmesa->state.depth.scale = 1.0 / (GLfloat)0xffffff;
+-      depth_fmt = R200_DEPTH_FORMAT_24BIT_INT_Z;
+-      rmesa->state.stencil.clear = 0xffff0000;
++      rmesa->radeon.state.depth.clear = 0x00ffffff;
++      rmesa->radeon.state.depth.scale = 1.0 / (GLfloat)0xffffff;
++      rmesa->radeon.state.stencil.clear = 0xffff0000;
+       break;
+    default:
+       fprintf( stderr, "Error: Unsupported depth %d... exiting\n",
+@@ -225,52 +633,37 @@ void r200InitState( r200ContextPtr rmesa )
+    }
+ 
+    /* Only have hw stencil when depth buffer is 24 bits deep */
+-   rmesa->state.stencil.hwBuffer = ( ctx->Visual.stencilBits > 0 &&
++   rmesa->radeon.state.stencil.hwBuffer = ( ctx->Visual.stencilBits > 0 &&
+ 				     ctx->Visual.depthBits == 24 );
+ 
+-   rmesa->Fallback = 0;
++   rmesa->radeon.Fallback = 0;
+ 
+-   if ( ctx->Visual.doubleBufferMode && rmesa->sarea->pfCurrentPage == 0 ) {
+-      drawOffset = rmesa->r200Screen->backOffset;
+-      drawPitch  = rmesa->r200Screen->backPitch;
+-   } else {
+-      drawOffset = rmesa->r200Screen->frontOffset;
+-      drawPitch  = rmesa->r200Screen->frontPitch;
+-   }
+-#if 000
+-   if ( ctx->Visual.doubleBufferMode && rmesa->sarea->pfCurrentPage == 0 ) {
+-      rmesa->state.color.drawOffset = rmesa->r200Screen->backOffset;
+-      rmesa->state.color.drawPitch  = rmesa->r200Screen->backPitch;
+-   } else {
+-      rmesa->state.color.drawOffset = rmesa->r200Screen->frontOffset;
+-      rmesa->state.color.drawPitch  = rmesa->r200Screen->frontPitch;
+-   }
+-
+-   rmesa->state.pixel.readOffset = rmesa->state.color.drawOffset;
+-   rmesa->state.pixel.readPitch  = rmesa->state.color.drawPitch;
+-#endif
+-
+-   rmesa->hw.max_state_size = 0;
++   rmesa->radeon.hw.max_state_size = 0;
+ 
+ #define ALLOC_STATE( ATOM, CHK, SZ, NM, IDX )				\
+    do {								\
+       rmesa->hw.ATOM.cmd_size = SZ;				\
+-      rmesa->hw.ATOM.cmd = (int *)CALLOC(SZ * sizeof(int));	\
+-      rmesa->hw.ATOM.lastcmd = (int *)CALLOC(SZ * sizeof(int));	\
++      rmesa->hw.ATOM.cmd = (GLuint *)CALLOC(SZ * sizeof(int));	\
++      rmesa->hw.ATOM.lastcmd = (GLuint *)CALLOC(SZ * sizeof(int));	\
+       rmesa->hw.ATOM.name = NM;					\
+       rmesa->hw.ATOM.idx = IDX;					\
+       rmesa->hw.ATOM.check = check_##CHK;			\
+       rmesa->hw.ATOM.dirty = GL_FALSE;				\
+-      rmesa->hw.max_state_size += SZ * sizeof(int);		\
++      rmesa->radeon.hw.max_state_size += SZ * sizeof(int);		\
+    } while (0)
+ 
+ 
+    /* Allocate state buffers:
+     */
+-   if (rmesa->r200Screen->drmSupportsBlendColor)
++   if (rmesa->radeon.radeonScreen->drmSupportsBlendColor)
+       ALLOC_STATE( ctx, always, CTX_STATE_SIZE_NEWDRM, "CTX/context", 0 );
+    else
+       ALLOC_STATE( ctx, always, CTX_STATE_SIZE_OLDDRM, "CTX/context", 0 );
++
++   if (rmesa->radeon.radeonScreen->kernel_mm)
++     rmesa->hw.ctx.emit = ctx_emit_cs;
++   else
++     rmesa->hw.ctx.emit = ctx_emit;
+    ALLOC_STATE( set, always, SET_STATE_SIZE, "SET/setup", 0 );
+    ALLOC_STATE( lin, always, LIN_STATE_SIZE, "LIN/line", 0 );
+    ALLOC_STATE( msk, always, MSK_STATE_SIZE, "MSK/mask", 0 );
+@@ -282,8 +675,8 @@ void r200InitState( r200ContextPtr rmesa )
+    ALLOC_STATE( cst, always, CST_STATE_SIZE, "CST/constant", 0 );
+    ALLOC_STATE( zbs, always, ZBS_STATE_SIZE, "ZBS/zbias", 0 );
+    ALLOC_STATE( tf, tf, TF_STATE_SIZE, "TF/tfactor", 0 );
+-   if (rmesa->r200Screen->drmSupportsFragShader) {
+-      if (rmesa->r200Screen->chip_family == CHIP_FAMILY_R200) {
++   if (rmesa->radeon.radeonScreen->drmSupportsFragShader) {
++      if (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_R200) {
+       /* make sure texture units 0/1 are emitted pair-wise for r200 t0 hang workaround */
+ 	 ALLOC_STATE( tex[0], tex_pair, TEX_STATE_SIZE_NEWDRM, "TEX/tex-0", 0 );
+ 	 ALLOC_STATE( tex[1], tex_pair, TEX_STATE_SIZE_NEWDRM, "TEX/tex-1", 1 );
+@@ -303,7 +696,7 @@ void r200InitState( r200ContextPtr rmesa )
+       ALLOC_STATE( afs[1], afs, AFS_STATE_SIZE, "AFS/afsinst-1", 1 );
+    }
+    else {
+-      if (rmesa->r200Screen->chip_family == CHIP_FAMILY_R200) {
++      if (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_R200) {
+ 	 ALLOC_STATE( tex[0], tex_pair, TEX_STATE_SIZE_OLDDRM, "TEX/tex-0", 0 );
+ 	 ALLOC_STATE( tex[1], tex_pair, TEX_STATE_SIZE_OLDDRM, "TEX/tex-1", 1 );
+ 	 ALLOC_STATE( tam, tex_any, TAM_STATE_SIZE, "TAM/tam", 0 );
+@@ -321,13 +714,18 @@ void r200InitState( r200ContextPtr rmesa )
+       ALLOC_STATE( afs[0], never, AFS_STATE_SIZE, "AFS/afsinst-0", 0 );
+       ALLOC_STATE( afs[1], never, AFS_STATE_SIZE, "AFS/afsinst-1", 1 );
+    }
+-   if (rmesa->r200Screen->drmSupportsCubeMapsR200) {
++
++   for (i = 0; i < 5; i++)
++     rmesa->hw.tex[i].emit = tex_emit;
++   if (rmesa->radeon.radeonScreen->drmSupportsCubeMapsR200) {
+       ALLOC_STATE( cube[0], tex_cube, CUBE_STATE_SIZE, "CUBE/tex-0", 0 );
+       ALLOC_STATE( cube[1], tex_cube, CUBE_STATE_SIZE, "CUBE/tex-1", 1 );
+       ALLOC_STATE( cube[2], tex_cube, CUBE_STATE_SIZE, "CUBE/tex-2", 2 );
+       ALLOC_STATE( cube[3], tex_cube, CUBE_STATE_SIZE, "CUBE/tex-3", 3 );
+       ALLOC_STATE( cube[4], tex_cube, CUBE_STATE_SIZE, "CUBE/tex-4", 4 );
+       ALLOC_STATE( cube[5], tex_cube, CUBE_STATE_SIZE, "CUBE/tex-5", 5 );
++      for (i = 0; i < 5; i++)
++	rmesa->hw.cube[i].emit = cube_emit;
+    }
+    else {
+       ALLOC_STATE( cube[0], never, CUBE_STATE_SIZE, "CUBE/tex-0", 0 );
+@@ -337,7 +735,8 @@ void r200InitState( r200ContextPtr rmesa )
+       ALLOC_STATE( cube[4], never, CUBE_STATE_SIZE, "CUBE/tex-4", 4 );
+       ALLOC_STATE( cube[5], never, CUBE_STATE_SIZE, "CUBE/tex-5", 5 );
+    }
+-   if (rmesa->r200Screen->drmSupportsVertexProgram) {
++
++   if (rmesa->radeon.radeonScreen->drmSupportsVertexProgram) {
+       ALLOC_STATE( pvs, tcl_vp, PVS_STATE_SIZE, "PVS/pvscntl", 0 );
+       ALLOC_STATE( vpi[0], tcl_vp, VPI_STATE_SIZE, "VP/vertexprog-0", 0 );
+       ALLOC_STATE( vpi[1], tcl_vp_size, VPI_STATE_SIZE, "VP/vertexprog-1", 1 );
+@@ -390,13 +789,13 @@ void r200InitState( r200ContextPtr rmesa )
+    ALLOC_STATE( pix[3], texenv, PIX_STATE_SIZE, "PIX/pixstage-3", 3 );
+    ALLOC_STATE( pix[4], texenv, PIX_STATE_SIZE, "PIX/pixstage-4", 4 );
+    ALLOC_STATE( pix[5], texenv, PIX_STATE_SIZE, "PIX/pixstage-5", 5 );
+-   if (rmesa->r200Screen->drmSupportsTriPerf) {
++   if (rmesa->radeon.radeonScreen->drmSupportsTriPerf) {
+       ALLOC_STATE( prf, always, PRF_STATE_SIZE, "PRF/performance-tri", 0 );
+    }
+    else {
+       ALLOC_STATE( prf, never, PRF_STATE_SIZE, "PRF/performance-tri", 0 );
+    }
+-   if (rmesa->r200Screen->drmSupportsPointSprites) {
++   if (rmesa->radeon.radeonScreen->drmSupportsPointSprites) {
+       ALLOC_STATE( spr, always, SPR_STATE_SIZE, "SPR/pointsprite", 0 );
+       ALLOC_STATE( ptp, tcl, PTP_STATE_SIZE, "PTP/pointparams", 0 );
+    }
+@@ -409,87 +808,115 @@ void r200InitState( r200ContextPtr rmesa )
+ 
+    /* Fill in the packet headers:
+     */
+-   rmesa->hw.ctx.cmd[CTX_CMD_0] = cmdpkt(RADEON_EMIT_PP_MISC);
+-   rmesa->hw.ctx.cmd[CTX_CMD_1] = cmdpkt(RADEON_EMIT_PP_CNTL);
+-   rmesa->hw.ctx.cmd[CTX_CMD_2] = cmdpkt(RADEON_EMIT_RB3D_COLORPITCH);
+-   if (rmesa->r200Screen->drmSupportsBlendColor)
+-      rmesa->hw.ctx.cmd[CTX_CMD_3] = cmdpkt(R200_EMIT_RB3D_BLENDCOLOR);
+-   rmesa->hw.lin.cmd[LIN_CMD_0] = cmdpkt(RADEON_EMIT_RE_LINE_PATTERN);
+-   rmesa->hw.lin.cmd[LIN_CMD_1] = cmdpkt(RADEON_EMIT_SE_LINE_WIDTH);
+-   rmesa->hw.msk.cmd[MSK_CMD_0] = cmdpkt(RADEON_EMIT_RB3D_STENCILREFMASK);
+-   rmesa->hw.vpt.cmd[VPT_CMD_0] = cmdpkt(RADEON_EMIT_SE_VPORT_XSCALE);
+-   rmesa->hw.set.cmd[SET_CMD_0] = cmdpkt(RADEON_EMIT_SE_CNTL);
+-   rmesa->hw.msc.cmd[MSC_CMD_0] = cmdpkt(RADEON_EMIT_RE_MISC);
+-   rmesa->hw.cst.cmd[CST_CMD_0] = cmdpkt(R200_EMIT_PP_CNTL_X);
+-   rmesa->hw.cst.cmd[CST_CMD_1] = cmdpkt(R200_EMIT_RB3D_DEPTHXY_OFFSET);
+-   rmesa->hw.cst.cmd[CST_CMD_2] = cmdpkt(R200_EMIT_RE_AUX_SCISSOR_CNTL);
+-   rmesa->hw.cst.cmd[CST_CMD_3] = cmdpkt(R200_EMIT_RE_SCISSOR_TL_0);
+-   rmesa->hw.cst.cmd[CST_CMD_4] = cmdpkt(R200_EMIT_SE_VAP_CNTL_STATUS);
+-   rmesa->hw.cst.cmd[CST_CMD_5] = cmdpkt(R200_EMIT_RE_POINTSIZE);
+-   rmesa->hw.cst.cmd[CST_CMD_6] = cmdpkt(R200_EMIT_TCL_INPUT_VTX_VECTOR_ADDR_0);
+-   rmesa->hw.tam.cmd[TAM_CMD_0] = cmdpkt(R200_EMIT_PP_TAM_DEBUG3);
+-   rmesa->hw.tf.cmd[TF_CMD_0] = cmdpkt(R200_EMIT_TFACTOR_0);
+-   if (rmesa->r200Screen->drmSupportsFragShader) {
+-      rmesa->hw.atf.cmd[ATF_CMD_0] = cmdpkt(R200_EMIT_ATF_TFACTOR);
+-      rmesa->hw.tex[0].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCTLALL_0);
+-      rmesa->hw.tex[0].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_0);
+-      rmesa->hw.tex[1].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCTLALL_1);
+-      rmesa->hw.tex[1].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_1);
+-      rmesa->hw.tex[2].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCTLALL_2);
+-      rmesa->hw.tex[2].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_2);
+-      rmesa->hw.tex[3].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCTLALL_3);
+-      rmesa->hw.tex[3].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_3);
+-      rmesa->hw.tex[4].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCTLALL_4);
+-      rmesa->hw.tex[4].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_4);
+-      rmesa->hw.tex[5].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCTLALL_5);
+-      rmesa->hw.tex[5].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_5);
++   rmesa->hw.ctx.cmd[CTX_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_PP_MISC);
++   rmesa->hw.ctx.cmd[CTX_CMD_1] = cmdpkt(rmesa, RADEON_EMIT_PP_CNTL);
++   rmesa->hw.ctx.cmd[CTX_CMD_2] = cmdpkt(rmesa, RADEON_EMIT_RB3D_COLORPITCH);
++   if (rmesa->radeon.radeonScreen->drmSupportsBlendColor)
++      rmesa->hw.ctx.cmd[CTX_CMD_3] = cmdpkt(rmesa, R200_EMIT_RB3D_BLENDCOLOR);
++   rmesa->hw.lin.cmd[LIN_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_RE_LINE_PATTERN);
++   rmesa->hw.lin.cmd[LIN_CMD_1] = cmdpkt(rmesa, RADEON_EMIT_SE_LINE_WIDTH);
++   rmesa->hw.msk.cmd[MSK_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_RB3D_STENCILREFMASK);
++   rmesa->hw.vpt.cmd[VPT_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_SE_VPORT_XSCALE);
++   rmesa->hw.set.cmd[SET_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_SE_CNTL);
++   rmesa->hw.msc.cmd[MSC_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_RE_MISC);
++   rmesa->hw.cst.cmd[CST_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_CNTL_X);
++   rmesa->hw.cst.cmd[CST_CMD_1] = cmdpkt(rmesa, R200_EMIT_RB3D_DEPTHXY_OFFSET);
++   rmesa->hw.cst.cmd[CST_CMD_2] = cmdpkt(rmesa, R200_EMIT_RE_AUX_SCISSOR_CNTL);
++   rmesa->hw.cst.cmd[CST_CMD_3] = cmdpkt(rmesa, R200_EMIT_RE_SCISSOR_TL_0);
++   rmesa->hw.cst.cmd[CST_CMD_4] = cmdpkt(rmesa, R200_EMIT_SE_VAP_CNTL_STATUS);
++   rmesa->hw.cst.cmd[CST_CMD_5] = cmdpkt(rmesa, R200_EMIT_RE_POINTSIZE);
++   rmesa->hw.cst.cmd[CST_CMD_6] = cmdpkt(rmesa, R200_EMIT_TCL_INPUT_VTX_VECTOR_ADDR_0);
++   rmesa->hw.tam.cmd[TAM_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TAM_DEBUG3);
++   rmesa->hw.tf.cmd[TF_CMD_0] = cmdpkt(rmesa, R200_EMIT_TFACTOR_0);
++   if (rmesa->radeon.radeonScreen->drmSupportsFragShader) {
++      rmesa->hw.atf.cmd[ATF_CMD_0] = cmdpkt(rmesa, R200_EMIT_ATF_TFACTOR);
++      rmesa->hw.tex[0].cmd[TEX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXCTLALL_0);
++      rmesa->hw.tex[0].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(rmesa, R200_EMIT_PP_TXOFFSET_0);
++      rmesa->hw.tex[1].cmd[TEX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXCTLALL_1);
++      rmesa->hw.tex[1].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(rmesa, R200_EMIT_PP_TXOFFSET_1);
++      rmesa->hw.tex[2].cmd[TEX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXCTLALL_2);
++      rmesa->hw.tex[2].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(rmesa, R200_EMIT_PP_TXOFFSET_2);
++      rmesa->hw.tex[3].cmd[TEX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXCTLALL_3);
++      rmesa->hw.tex[3].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(rmesa, R200_EMIT_PP_TXOFFSET_3);
++      rmesa->hw.tex[4].cmd[TEX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXCTLALL_4);
++      rmesa->hw.tex[4].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(rmesa, R200_EMIT_PP_TXOFFSET_4);
++      rmesa->hw.tex[5].cmd[TEX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXCTLALL_5);
++      rmesa->hw.tex[5].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(rmesa, R200_EMIT_PP_TXOFFSET_5);
+    } else {
+-      rmesa->hw.tex[0].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXFILTER_0);
+-      rmesa->hw.tex[0].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_0);
+-      rmesa->hw.tex[1].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXFILTER_1);
+-      rmesa->hw.tex[1].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_1);
+-      rmesa->hw.tex[2].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXFILTER_2);
+-      rmesa->hw.tex[2].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_2);
+-      rmesa->hw.tex[3].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXFILTER_3);
+-      rmesa->hw.tex[3].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_3);
+-      rmesa->hw.tex[4].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXFILTER_4);
+-      rmesa->hw.tex[4].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_4);
+-      rmesa->hw.tex[5].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXFILTER_5);
+-      rmesa->hw.tex[5].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_5);
+-   }
+-   rmesa->hw.afs[0].cmd[AFS_CMD_0] = cmdpkt(R200_EMIT_PP_AFS_0);
+-   rmesa->hw.afs[1].cmd[AFS_CMD_0] = cmdpkt(R200_EMIT_PP_AFS_1);
+-   rmesa->hw.pvs.cmd[PVS_CMD_0] = cmdpkt(R200_EMIT_VAP_PVS_CNTL);
+-   rmesa->hw.cube[0].cmd[CUBE_CMD_0] = cmdpkt(R200_EMIT_PP_CUBIC_FACES_0);
+-   rmesa->hw.cube[0].cmd[CUBE_CMD_1] = cmdpkt(R200_EMIT_PP_CUBIC_OFFSETS_0);
+-   rmesa->hw.cube[1].cmd[CUBE_CMD_0] = cmdpkt(R200_EMIT_PP_CUBIC_FACES_1);
+-   rmesa->hw.cube[1].cmd[CUBE_CMD_1] = cmdpkt(R200_EMIT_PP_CUBIC_OFFSETS_1);
+-   rmesa->hw.cube[2].cmd[CUBE_CMD_0] = cmdpkt(R200_EMIT_PP_CUBIC_FACES_2);
+-   rmesa->hw.cube[2].cmd[CUBE_CMD_1] = cmdpkt(R200_EMIT_PP_CUBIC_OFFSETS_2);
+-   rmesa->hw.cube[3].cmd[CUBE_CMD_0] = cmdpkt(R200_EMIT_PP_CUBIC_FACES_3);
+-   rmesa->hw.cube[3].cmd[CUBE_CMD_1] = cmdpkt(R200_EMIT_PP_CUBIC_OFFSETS_3);
+-   rmesa->hw.cube[4].cmd[CUBE_CMD_0] = cmdpkt(R200_EMIT_PP_CUBIC_FACES_4);
+-   rmesa->hw.cube[4].cmd[CUBE_CMD_1] = cmdpkt(R200_EMIT_PP_CUBIC_OFFSETS_4);
+-   rmesa->hw.cube[5].cmd[CUBE_CMD_0] = cmdpkt(R200_EMIT_PP_CUBIC_FACES_5);
+-   rmesa->hw.cube[5].cmd[CUBE_CMD_1] = cmdpkt(R200_EMIT_PP_CUBIC_OFFSETS_5);
+-   rmesa->hw.pix[0].cmd[PIX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCBLEND_0);
+-   rmesa->hw.pix[1].cmd[PIX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCBLEND_1);
+-   rmesa->hw.pix[2].cmd[PIX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCBLEND_2);
+-   rmesa->hw.pix[3].cmd[PIX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCBLEND_3);
+-   rmesa->hw.pix[4].cmd[PIX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCBLEND_4);
+-   rmesa->hw.pix[5].cmd[PIX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCBLEND_5);
+-   rmesa->hw.zbs.cmd[ZBS_CMD_0] = cmdpkt(RADEON_EMIT_SE_ZBIAS_FACTOR);
+-   rmesa->hw.tcl.cmd[TCL_CMD_0] = cmdpkt(R200_EMIT_TCL_LIGHT_MODEL_CTL_0);
+-   rmesa->hw.tcl.cmd[TCL_CMD_1] = cmdpkt(R200_EMIT_TCL_UCP_VERT_BLEND_CTL);
+-   rmesa->hw.tcg.cmd[TCG_CMD_0] = cmdpkt(R200_EMIT_TEX_PROC_CTL_2);
+-   rmesa->hw.msl.cmd[MSL_CMD_0] = cmdpkt(R200_EMIT_MATRIX_SELECT_0);
+-   rmesa->hw.vap.cmd[VAP_CMD_0] = cmdpkt(R200_EMIT_VAP_CTL);
+-   rmesa->hw.vtx.cmd[VTX_CMD_0] = cmdpkt(R200_EMIT_VTX_FMT_0);
+-   rmesa->hw.vtx.cmd[VTX_CMD_1] = cmdpkt(R200_EMIT_OUTPUT_VTX_COMP_SEL);
+-   rmesa->hw.vtx.cmd[VTX_CMD_2] = cmdpkt(R200_EMIT_SE_VTX_STATE_CNTL);
+-   rmesa->hw.vte.cmd[VTE_CMD_0] = cmdpkt(R200_EMIT_VTE_CNTL);
+-   rmesa->hw.prf.cmd[PRF_CMD_0] = cmdpkt(R200_EMIT_PP_TRI_PERF_CNTL);
+-   rmesa->hw.spr.cmd[SPR_CMD_0] = cmdpkt(R200_EMIT_TCL_POINT_SPRITE_CNTL);
++      rmesa->hw.tex[0].cmd[TEX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXFILTER_0);
++      rmesa->hw.tex[0].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(rmesa, R200_EMIT_PP_TXOFFSET_0);
++      rmesa->hw.tex[1].cmd[TEX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXFILTER_1);
++      rmesa->hw.tex[1].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(rmesa, R200_EMIT_PP_TXOFFSET_1);
++      rmesa->hw.tex[2].cmd[TEX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXFILTER_2);
++      rmesa->hw.tex[2].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(rmesa, R200_EMIT_PP_TXOFFSET_2);
++      rmesa->hw.tex[3].cmd[TEX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXFILTER_3);
++      rmesa->hw.tex[3].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(rmesa, R200_EMIT_PP_TXOFFSET_3);
++      rmesa->hw.tex[4].cmd[TEX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXFILTER_4);
++      rmesa->hw.tex[4].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(rmesa, R200_EMIT_PP_TXOFFSET_4);
++      rmesa->hw.tex[5].cmd[TEX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXFILTER_5);
++      rmesa->hw.tex[5].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(rmesa, R200_EMIT_PP_TXOFFSET_5);
++   }
++   rmesa->hw.afs[0].cmd[AFS_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_AFS_0);
++   rmesa->hw.afs[1].cmd[AFS_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_AFS_1);
++   rmesa->hw.pvs.cmd[PVS_CMD_0] = cmdpkt(rmesa, R200_EMIT_VAP_PVS_CNTL);
++   rmesa->hw.cube[0].cmd[CUBE_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_CUBIC_FACES_0);
++   rmesa->hw.cube[0].cmd[CUBE_CMD_1] = cmdpkt(rmesa, R200_EMIT_PP_CUBIC_OFFSETS_0);
++   rmesa->hw.cube[1].cmd[CUBE_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_CUBIC_FACES_1);
++   rmesa->hw.cube[1].cmd[CUBE_CMD_1] = cmdpkt(rmesa, R200_EMIT_PP_CUBIC_OFFSETS_1);
++   rmesa->hw.cube[2].cmd[CUBE_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_CUBIC_FACES_2);
++   rmesa->hw.cube[2].cmd[CUBE_CMD_1] = cmdpkt(rmesa, R200_EMIT_PP_CUBIC_OFFSETS_2);
++   rmesa->hw.cube[3].cmd[CUBE_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_CUBIC_FACES_3);
++   rmesa->hw.cube[3].cmd[CUBE_CMD_1] = cmdpkt(rmesa, R200_EMIT_PP_CUBIC_OFFSETS_3);
++   rmesa->hw.cube[4].cmd[CUBE_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_CUBIC_FACES_4);
++   rmesa->hw.cube[4].cmd[CUBE_CMD_1] = cmdpkt(rmesa, R200_EMIT_PP_CUBIC_OFFSETS_4);
++   rmesa->hw.cube[5].cmd[CUBE_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_CUBIC_FACES_5);
++   rmesa->hw.cube[5].cmd[CUBE_CMD_1] = cmdpkt(rmesa, R200_EMIT_PP_CUBIC_OFFSETS_5);
++   rmesa->hw.pix[0].cmd[PIX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXCBLEND_0);
++   rmesa->hw.pix[1].cmd[PIX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXCBLEND_1);
++   rmesa->hw.pix[2].cmd[PIX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXCBLEND_2);
++   rmesa->hw.pix[3].cmd[PIX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXCBLEND_3);
++   rmesa->hw.pix[4].cmd[PIX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXCBLEND_4);
++   rmesa->hw.pix[5].cmd[PIX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXCBLEND_5);
++   rmesa->hw.zbs.cmd[ZBS_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_SE_ZBIAS_FACTOR);
++   rmesa->hw.tcl.cmd[TCL_CMD_0] = cmdpkt(rmesa, R200_EMIT_TCL_LIGHT_MODEL_CTL_0);
++   rmesa->hw.tcl.cmd[TCL_CMD_1] = cmdpkt(rmesa, R200_EMIT_TCL_UCP_VERT_BLEND_CTL);
++   rmesa->hw.tcg.cmd[TCG_CMD_0] = cmdpkt(rmesa, R200_EMIT_TEX_PROC_CTL_2);
++   rmesa->hw.msl.cmd[MSL_CMD_0] = cmdpkt(rmesa, R200_EMIT_MATRIX_SELECT_0);
++   rmesa->hw.vap.cmd[VAP_CMD_0] = cmdpkt(rmesa, R200_EMIT_VAP_CTL);
++   rmesa->hw.vtx.cmd[VTX_CMD_0] = cmdpkt(rmesa, R200_EMIT_VTX_FMT_0);
++   rmesa->hw.vtx.cmd[VTX_CMD_1] = cmdpkt(rmesa, R200_EMIT_OUTPUT_VTX_COMP_SEL);
++   rmesa->hw.vtx.cmd[VTX_CMD_2] = cmdpkt(rmesa, R200_EMIT_SE_VTX_STATE_CNTL);
++   rmesa->hw.vte.cmd[VTE_CMD_0] = cmdpkt(rmesa, R200_EMIT_VTE_CNTL);
++   rmesa->hw.prf.cmd[PRF_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TRI_PERF_CNTL);
++   rmesa->hw.spr.cmd[SPR_CMD_0] = cmdpkt(rmesa, R200_EMIT_TCL_POINT_SPRITE_CNTL);
++   if (rmesa->radeon.radeonScreen->kernel_mm) {
++	rmesa->hw.mtl[0].emit = mtl_emit;
++	rmesa->hw.mtl[1].emit = mtl_emit;
++
++	rmesa->hw.vpi[0].emit = veclinear_emit;
++	rmesa->hw.vpi[1].emit = veclinear_emit;
++	rmesa->hw.vpp[0].emit = veclinear_emit;
++	rmesa->hw.vpp[1].emit = veclinear_emit;
++
++	rmesa->hw.grd.emit = scl_emit;
++	rmesa->hw.fog.emit = vec_emit;
++	rmesa->hw.glt.emit = vec_emit;
++	rmesa->hw.eye.emit = vec_emit;
++
++	for (i = R200_MTX_MV; i <= R200_MTX_TEX5; i++)
++	  rmesa->hw.mat[i].emit = vec_emit;
++
++	for (i = 0; i < 8; i++)
++	  rmesa->hw.lit[i].emit = lit_emit;
++
++	for (i = 0; i < 6; i++)
++	  rmesa->hw.ucp[i].emit = vec_emit;
++
++	rmesa->hw.ptp.emit = ptp_emit;
++   }
 +
-+static void legacy_kick_all_buffers(struct bo_manager_legacy *boml)
-+{
-+    struct bo_legacy *legacy;
 +
-+    legacy = boml->bos.next;
-+    while (legacy != &boml->bos) {
-+	if (legacy->tobj) {
-+	    if (legacy->validated) {
-+		driDestroyTextureObject(&legacy->tobj->base);
-+		legacy->tobj = 0;
-+		legacy->validated = 0;
-+	    }
-+	}
-+	legacy = legacy->next;
-+    }
-+}
++   
+    rmesa->hw.mtl[0].cmd[MTL_CMD_0] = 
+       cmdvec( R200_VS_MAT_0_EMISS, 1, 16 );
+    rmesa->hw.mtl[0].cmd[MTL_CMD_1] = 
+@@ -567,7 +994,7 @@ void r200InitState( r200ContextPtr rmesa )
+ 				(R200_BLEND_GL_ONE << R200_SRC_BLEND_SHIFT) |
+ 				(R200_BLEND_GL_ZERO << R200_DST_BLEND_SHIFT));
+ 
+-   if (rmesa->r200Screen->drmSupportsBlendColor) {
++   if (rmesa->radeon.radeonScreen->drmSupportsBlendColor) {
+       rmesa->hw.ctx.cmd[CTX_RB3D_BLENDCOLOR] = 0x00000000;
+       rmesa->hw.ctx.cmd[CTX_RB3D_ABLENDCNTL] = (R200_COMB_FCN_ADD_CLAMP |
+ 				(R200_BLEND_GL_ONE << R200_SRC_BLEND_SHIFT) |
+@@ -578,18 +1005,17 @@ void r200InitState( r200ContextPtr rmesa )
+    }
+ 
+    rmesa->hw.ctx.cmd[CTX_RB3D_DEPTHOFFSET] =
+-      rmesa->r200Screen->depthOffset + rmesa->r200Screen->fbLocation;
++      rmesa->radeon.radeonScreen->depthOffset + rmesa->radeon.radeonScreen->fbLocation;
+ 
+    rmesa->hw.ctx.cmd[CTX_RB3D_DEPTHPITCH] = 
+-      ((rmesa->r200Screen->depthPitch &
++      ((rmesa->radeon.radeonScreen->depthPitch &
+ 	R200_DEPTHPITCH_MASK) |
+        R200_DEPTH_ENDIAN_NO_SWAP);
+    
+    if (rmesa->using_hyperz)
+       rmesa->hw.ctx.cmd[CTX_RB3D_DEPTHPITCH] |= R200_DEPTH_HYPERZ;
+ 
+-   rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] = (depth_fmt |
+-					       R200_Z_TEST_LESS |
++   rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] = (R200_Z_TEST_LESS |
+ 					       R200_STENCIL_TEST_ALWAYS |
+ 					       R200_STENCIL_FAIL_KEEP |
+ 					       R200_STENCIL_ZPASS_KEEP |
+@@ -599,15 +1025,14 @@ void r200InitState( r200ContextPtr rmesa )
+    if (rmesa->using_hyperz) {
+       rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] |= R200_Z_COMPRESSION_ENABLE |
+ 						  R200_Z_DECOMPRESSION_ENABLE;
+-/*      if (rmesa->r200Screen->chip_family == CHIP_FAMILY_R200)
++/*      if (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_R200)
+ 	 rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] |= RADEON_Z_HIERARCHY_ENABLE;*/
+    }
+ 
+    rmesa->hw.ctx.cmd[CTX_PP_CNTL] = (R200_ANTI_ALIAS_NONE 
+  				     | R200_TEX_BLEND_0_ENABLE);
+ 
+-   rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] = color_fmt;
+-   switch ( driQueryOptioni( &rmesa->optionCache, "dither_mode" ) ) {
++   switch ( driQueryOptioni( &rmesa->radeon.optionCache, "dither_mode" ) ) {
+    case DRI_CONF_DITHER_XERRORDIFFRESET:
+       rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= R200_DITHER_INIT;
+       break;
+@@ -615,41 +1040,19 @@ void r200InitState( r200ContextPtr rmesa )
+       rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= R200_SCALE_DITHER_ENABLE;
+       break;
+    }
+-   if ( driQueryOptioni( &rmesa->optionCache, "round_mode" ) ==
++   if ( driQueryOptioni( &rmesa->radeon.optionCache, "round_mode" ) ==
+ 	DRI_CONF_ROUND_ROUND )
+-      rmesa->state.color.roundEnable = R200_ROUND_ENABLE;
++      rmesa->radeon.state.color.roundEnable = R200_ROUND_ENABLE;
+    else
+-      rmesa->state.color.roundEnable = 0;
+-   if ( driQueryOptioni (&rmesa->optionCache, "color_reduction" ) ==
++      rmesa->radeon.state.color.roundEnable = 0;
++   if ( driQueryOptioni (&rmesa->radeon.optionCache, "color_reduction" ) ==
+ 	DRI_CONF_COLOR_REDUCTION_DITHER )
+       rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= R200_DITHER_ENABLE;
+    else
+-      rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= rmesa->state.color.roundEnable;
+-
+-#if 000
+-   rmesa->hw.ctx.cmd[CTX_RB3D_COLOROFFSET] = ((rmesa->state.color.drawOffset +
+-					       rmesa->r200Screen->fbLocation)
+-					      & R200_COLOROFFSET_MASK);
+-
+-   rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] = ((rmesa->state.color.drawPitch &
+-					      R200_COLORPITCH_MASK) |
+-					     R200_COLOR_ENDIAN_NO_SWAP);
+-#else
+-   rmesa->hw.ctx.cmd[CTX_RB3D_COLOROFFSET] = ((drawOffset +
+-					       rmesa->r200Screen->fbLocation)
+-					      & R200_COLOROFFSET_MASK);
+-
+-   rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] = ((drawPitch &
+-					      R200_COLORPITCH_MASK) |
+-					     R200_COLOR_ENDIAN_NO_SWAP);
+-#endif
+-   /* (fixed size) sarea is initialized to zero afaics so can omit version check. Phew! */
+-   if (rmesa->sarea->tiling_enabled) {
+-      rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |= R200_COLOR_TILE_ENABLE;
+-   }
++      rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= rmesa->radeon.state.color.roundEnable;
+ 
+    rmesa->hw.prf.cmd[PRF_PP_TRI_PERF] = R200_TRI_CUTOFF_MASK - R200_TRI_CUTOFF_MASK * 
+-			driQueryOptionf (&rmesa->optionCache,"texture_blend_quality");
++			driQueryOptionf (&rmesa->radeon.optionCache,"texture_blend_quality");
+    rmesa->hw.prf.cmd[PRF_PP_PERF_CNTL] = 0;
+ 
+    rmesa->hw.set.cmd[SET_SE_CNTL] = (R200_FFACE_CULL_CCW |
+@@ -704,7 +1107,7 @@ void r200InitState( r200ContextPtr rmesa )
+ 						R200_VC_NO_SWAP;
+ #endif
+ 
+-   if (!(rmesa->r200Screen->chip_flags & RADEON_CHIPSET_TCL)) {
++   if (!(rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL)) {
+       /* Bypass TCL */
+       rmesa->hw.cst.cmd[CST_SE_VAP_CNTL_STATUS] |= (1<<8);
+    }
+@@ -743,28 +1146,28 @@ void r200InitState( r200ContextPtr rmesa )
+       rmesa->hw.tex[i].cmd[TEX_PP_TXFORMAT_X] =
+          (/* R200_TEXCOORD_PROJ | */
+           0x100000);	/* Small default bias */
+-      if (rmesa->r200Screen->drmSupportsFragShader) {
++      if (rmesa->radeon.radeonScreen->drmSupportsFragShader) {
+ 	 rmesa->hw.tex[i].cmd[TEX_PP_TXOFFSET_NEWDRM] =
+-	     rmesa->r200Screen->texOffset[RADEON_LOCAL_TEX_HEAP];
++	     rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
+ 	 rmesa->hw.tex[i].cmd[TEX_PP_CUBIC_FACES] = 0;
+ 	 rmesa->hw.tex[i].cmd[TEX_PP_TXMULTI_CTL] = 0;
+       }
+       else {
+ 	  rmesa->hw.tex[i].cmd[TEX_PP_TXOFFSET_OLDDRM] =
+-	     rmesa->r200Screen->texOffset[RADEON_LOCAL_TEX_HEAP];
++	     rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
+      }
+ 
+       rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_FACES] = 0;
+       rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_F1] =
+-         rmesa->r200Screen->texOffset[RADEON_LOCAL_TEX_HEAP];
++         rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
+       rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_F2] =
+-         rmesa->r200Screen->texOffset[RADEON_LOCAL_TEX_HEAP];
++         rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
+       rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_F3] =
+-         rmesa->r200Screen->texOffset[RADEON_LOCAL_TEX_HEAP];
++         rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
+       rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_F4] =
+-         rmesa->r200Screen->texOffset[RADEON_LOCAL_TEX_HEAP];
++         rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
+       rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_F5] =
+-         rmesa->r200Screen->texOffset[RADEON_LOCAL_TEX_HEAP];
++         rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
+ 
+       rmesa->hw.pix[i].cmd[PIX_PP_TXCBLEND] =
+          (R200_TXC_ARG_A_ZERO |
+@@ -967,5 +1370,7 @@ void r200InitState( r200ContextPtr rmesa )
+ 
+    r200LightingSpaceChange( ctx );
+ 
+-   rmesa->hw.all_dirty = GL_TRUE;
++   rmesa->radeon.hw.all_dirty = GL_TRUE;
++
++   rcommonInitCmdBuf(&rmesa->radeon);
+ }
+diff --git a/src/mesa/drivers/dri/r200/r200_swtcl.c b/src/mesa/drivers/dri/r200/r200_swtcl.c
+index b25f028..b006409 100644
+--- a/src/mesa/drivers/dri/r200/r200_swtcl.c
++++ b/src/mesa/drivers/dri/r200/r200_swtcl.c
+@@ -55,27 +55,24 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "r200_tcl.h"
+ 
+ 
+-static void flush_last_swtcl_prim( r200ContextPtr rmesa  );
+-
+-
+ /***********************************************************************
+  *                         Initialization 
+  ***********************************************************************/
+ 
+ #define EMIT_ATTR( ATTR, STYLE, F0 )					\
+ do {									\
+-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].attrib = (ATTR);	\
+-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].format = (STYLE);	\
+-   rmesa->swtcl.vertex_attr_count++;					\
++   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].attrib = (ATTR);	\
++   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].format = (STYLE);	\
++   rmesa->radeon.swtcl.vertex_attr_count++;					\
+    fmt_0 |= F0;								\
+ } while (0)
+ 
+ #define EMIT_PAD( N )							\
+ do {									\
+-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].attrib = 0;		\
+-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].format = EMIT_PAD;	\
+-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].offset = (N);		\
+-   rmesa->swtcl.vertex_attr_count++;					\
++   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].attrib = 0;		\
++   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].format = EMIT_PAD;	\
++   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].offset = (N);		\
++   rmesa->radeon.swtcl.vertex_attr_count++;					\
+ } while (0)
+ 
+ static void r200SetVertexFormat( GLcontext *ctx )
+@@ -100,7 +97,7 @@ static void r200SetVertexFormat( GLcontext *ctx )
+    }
+ 
+    assert( VB->AttribPtr[VERT_ATTRIB_POS] != NULL );
+-   rmesa->swtcl.vertex_attr_count = 0;
++   rmesa->radeon.swtcl.vertex_attr_count = 0;
+ 
+    /* EMIT_ATTR's must be in order as they tell t_vertex.c how to
+     * build up a hardware vertex.
+@@ -185,7 +182,7 @@ static void r200SetVertexFormat( GLcontext *ctx )
+       rmesa->hw.ctx.cmd[CTX_PP_FOG_COLOR] |= R200_FOG_USE_SPEC_ALPHA;
+    }
+ 
+-   if (!RENDERINPUTS_EQUAL( rmesa->tnl_index_bitset, index_bitset ) ||
++   if (!RENDERINPUTS_EQUAL( rmesa->radeon.tnl_index_bitset, index_bitset ) ||
+ 	(rmesa->hw.vtx.cmd[VTX_VTXFMT_0] != fmt_0) ||
+ 	(rmesa->hw.vtx.cmd[VTX_VTXFMT_1] != fmt_1) ) {
+       R200_NEWPRIM(rmesa);
+@@ -193,26 +190,20 @@ static void r200SetVertexFormat( GLcontext *ctx )
+       rmesa->hw.vtx.cmd[VTX_VTXFMT_0] = fmt_0;
+       rmesa->hw.vtx.cmd[VTX_VTXFMT_1] = fmt_1;
+ 
+-      rmesa->swtcl.vertex_size =
++      rmesa->radeon.swtcl.vertex_size =
+ 	  _tnl_install_attrs( ctx,
+-			      rmesa->swtcl.vertex_attrs, 
+-			      rmesa->swtcl.vertex_attr_count,
++			      rmesa->radeon.swtcl.vertex_attrs, 
++			      rmesa->radeon.swtcl.vertex_attr_count,
+ 			      NULL, 0 );
+-      rmesa->swtcl.vertex_size /= 4;
+-      RENDERINPUTS_COPY( rmesa->tnl_index_bitset, index_bitset );
++      rmesa->radeon.swtcl.vertex_size /= 4;
++      RENDERINPUTS_COPY( rmesa->radeon.tnl_index_bitset, index_bitset );
+    }
+ }
+ 
+ 
+ static void r200RenderStart( GLcontext *ctx )
+ {
+-   r200ContextPtr rmesa = R200_CONTEXT( ctx );
+-
+    r200SetVertexFormat( ctx );
+-
+-   if (rmesa->dma.flush != 0 && 
+-       rmesa->dma.flush != flush_last_swtcl_prim)
+-      rmesa->dma.flush( rmesa );
+ }
+ 
+ 
+@@ -232,7 +223,7 @@ void r200ChooseVertexState( GLcontext *ctx )
+     * rasterization fallback.  As this function will be called again when we
+     * leave a rasterization fallback, we can just skip it for now.
+     */
+-   if (rmesa->Fallback != 0)
++   if (rmesa->radeon.Fallback != 0)
+       return;
+ 
+    vte = rmesa->hw.vte.cmd[VTE_SE_VTE_CNTL];
+@@ -273,78 +264,27 @@ void r200ChooseVertexState( GLcontext *ctx )
+    }
+ }
+ 
+-
+-/* Flush vertices in the current dma region.
+- */
+-static void flush_last_swtcl_prim( r200ContextPtr rmesa  )
+-{
+-   if (R200_DEBUG & DEBUG_IOCTL)
+-      fprintf(stderr, "%s\n", __FUNCTION__);
+-
+-   rmesa->dma.flush = NULL;
+-
+-   if (rmesa->dma.current.buf) {
+-      struct r200_dma_region *current = &rmesa->dma.current;
+-      GLuint current_offset = (rmesa->r200Screen->gart_buffer_offset +
+-			       current->buf->buf->idx * RADEON_BUFFER_SIZE + 
+-			       current->start);
+-
+-      assert (!(rmesa->swtcl.hw_primitive & R200_VF_PRIM_WALK_IND));
+-
+-      assert (current->start + 
+-	      rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
+-	      current->ptr);
+-
+-      if (rmesa->dma.current.start != rmesa->dma.current.ptr) {
+-	 r200EnsureCmdBufSpace( rmesa, VERT_AOS_BUFSZ +
+-			        rmesa->hw.max_state_size + VBUF_BUFSZ );
+-	 r200EmitVertexAOS( rmesa,
+-			      rmesa->swtcl.vertex_size,
+-			      current_offset);
+-
+-	 r200EmitVbufPrim( rmesa,
+-			   rmesa->swtcl.hw_primitive,
+-			   rmesa->swtcl.numverts);
+-      }
+-
+-      rmesa->swtcl.numverts = 0;
+-      current->start = current->ptr;
+-   }
+-}
+-
+-
+-/* Alloc space in the current dma region.
+- */
+-static INLINE void *
+-r200AllocDmaLowVerts( r200ContextPtr rmesa, int nverts, int vsize )
++void r200_swtcl_flush(GLcontext *ctx, uint32_t current_offset)
+ {
+-   GLuint bytes = vsize * nverts;
+-
+-   if ( rmesa->dma.current.ptr + bytes > rmesa->dma.current.end ) 
+-      r200RefillCurrentDmaRegion( rmesa );
+-
+-   if (!rmesa->dma.flush) {
+-      rmesa->glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
+-      rmesa->dma.flush = flush_last_swtcl_prim;
+-   }
++   r200ContextPtr rmesa = R200_CONTEXT(ctx);
++   rcommonEnsureCmdBufSpace(&rmesa->radeon,
++			    rmesa->radeon.hw.max_state_size + (12*sizeof(int)),
++			    __FUNCTION__);
+ 
+-   ASSERT( vsize == rmesa->swtcl.vertex_size * 4 );
+-   ASSERT( rmesa->dma.flush == flush_last_swtcl_prim );
+-   ASSERT( rmesa->dma.current.start + 
+-	   rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
+-	   rmesa->dma.current.ptr );
+ 
++   radeonEmitState(&rmesa->radeon);
++   r200EmitVertexAOS( rmesa,
++		      rmesa->radeon.swtcl.vertex_size,
++		      rmesa->radeon.dma.current,
++		      current_offset);
+ 
+-   {
+-      GLubyte *head = (GLubyte *) (rmesa->dma.current.address + rmesa->dma.current.ptr);
+-      rmesa->dma.current.ptr += bytes;
+-      rmesa->swtcl.numverts += nverts;
+-      return head;
+-   }
++		      
++   r200EmitVbufPrim( rmesa,
++		     rmesa->radeon.swtcl.hw_primitive,
++		     rmesa->radeon.swtcl.numverts);
+ 
+ }
+ 
+-
+ /**************************************************************************/
+ 
+ 
+@@ -392,13 +332,13 @@ static void r200ResetLineStipple( GLcontext *ctx );
+ #undef LOCAL_VARS
+ #undef ALLOC_VERTS
+ #define CTX_ARG r200ContextPtr rmesa
+-#define GET_VERTEX_DWORDS() rmesa->swtcl.vertex_size
+-#define ALLOC_VERTS( n, size ) r200AllocDmaLowVerts( rmesa, n, size * 4 )
++#define GET_VERTEX_DWORDS() rmesa->radeon.swtcl.vertex_size
++#define ALLOC_VERTS( n, size ) rcommonAllocDmaLowVerts( &rmesa->radeon, n, size * 4 )
+ #define LOCAL_VARS						\
+    r200ContextPtr rmesa = R200_CONTEXT(ctx);		\
+-   const char *r200verts = (char *)rmesa->swtcl.verts;
+-#define VERT(x) (r200Vertex *)(r200verts + ((x) * vertsize * sizeof(int)))
+-#define VERTEX r200Vertex 
++   const char *r200verts = (char *)rmesa->radeon.swtcl.verts;
++#define VERT(x) (radeonVertex *)(r200verts + ((x) * vertsize * sizeof(int)))
++#define VERTEX radeonVertex 
+ #define DO_DEBUG_VERTS (1 && (R200_DEBUG & DEBUG_VERTS))
+ 
+ #undef TAG
+@@ -456,11 +396,11 @@ static struct {
+ #define VERT_Y(_v) _v->v.y
+ #define VERT_Z(_v) _v->v.z
+ #define AREA_IS_CCW( a ) (a < 0)
+-#define GET_VERTEX(e) (rmesa->swtcl.verts + (e*rmesa->swtcl.vertex_size*sizeof(int)))
++#define GET_VERTEX(e) (rmesa->radeon.swtcl.verts + (e*rmesa->radeon.swtcl.vertex_size*sizeof(int)))
+ 
+ #define VERT_SET_RGBA( v, c )  					\
+ do {								\
+-   r200_color_t *color = (r200_color_t *)&((v)->ui[coloroffset]);	\
++   radeon_color_t *color = (radeon_color_t *)&((v)->ui[coloroffset]);	\
+    UNCLAMPED_FLOAT_TO_UBYTE(color->red, (c)[0]);		\
+    UNCLAMPED_FLOAT_TO_UBYTE(color->green, (c)[1]);		\
+    UNCLAMPED_FLOAT_TO_UBYTE(color->blue, (c)[2]);		\
+@@ -472,7 +412,7 @@ do {								\
+ #define VERT_SET_SPEC( v, c )					\
+ do {								\
+    if (specoffset) {						\
+-      r200_color_t *spec = (r200_color_t *)&((v)->ui[specoffset]);	\
++      radeon_color_t *spec = (radeon_color_t *)&((v)->ui[specoffset]);	\
+       UNCLAMPED_FLOAT_TO_UBYTE(spec->red, (c)[0]);	\
+       UNCLAMPED_FLOAT_TO_UBYTE(spec->green, (c)[1]);	\
+       UNCLAMPED_FLOAT_TO_UBYTE(spec->blue, (c)[2]);	\
+@@ -481,8 +421,8 @@ do {								\
+ #define VERT_COPY_SPEC( v0, v1 )			\
+ do {							\
+    if (specoffset) {					\
+-      r200_color_t *spec0 = (r200_color_t *)&((v0)->ui[specoffset]);	\
+-      r200_color_t *spec1 = (r200_color_t *)&((v1)->ui[specoffset]);	\
++      radeon_color_t *spec0 = (radeon_color_t *)&((v0)->ui[specoffset]);	\
++      radeon_color_t *spec1 = (radeon_color_t *)&((v1)->ui[specoffset]);	\
+       spec0->red   = spec1->red;	\
+       spec0->green = spec1->green;	\
+       spec0->blue  = spec1->blue; 	\
+@@ -513,7 +453,7 @@ do {							\
+  ***********************************************************************/
+ 
+ #define RASTERIZE(x) r200RasterPrimitive( ctx, reduced_hw_prim(ctx, x) )
+-#define RENDER_PRIMITIVE rmesa->swtcl.render_primitive
++#define RENDER_PRIMITIVE rmesa->radeon.swtcl.render_primitive
+ #undef TAG
+ #define TAG(x) x
+ #include "tnl_dd/t_dd_unfilled.h"
+@@ -569,8 +509,8 @@ static void init_rast_tab( void )
+ #undef LOCAL_VARS
+ #define LOCAL_VARS						\
+    r200ContextPtr rmesa = R200_CONTEXT(ctx);		\
+-   const GLuint vertsize = rmesa->swtcl.vertex_size;		\
+-   const char *r200verts = (char *)rmesa->swtcl.verts;		\
++   const GLuint vertsize = rmesa->radeon.swtcl.vertex_size;		\
++   const char *r200verts = (char *)rmesa->radeon.swtcl.verts;		\
+    const GLuint * const elt = TNL_CONTEXT(ctx)->vb.Elts;	\
+    const GLboolean stipple = ctx->Line.StippleFlag;		\
+    (void) elt; (void) stipple;
+@@ -599,13 +539,13 @@ void r200ChooseRenderState( GLcontext *ctx )
+    GLuint index = 0;
+    GLuint flags = ctx->_TriangleCaps;
+ 
+-   if (!rmesa->TclFallback || rmesa->Fallback) 
++   if (!rmesa->radeon.TclFallback || rmesa->radeon.Fallback) 
+       return;
+ 
+    if (flags & DD_TRI_LIGHT_TWOSIDE) index |= R200_TWOSIDE_BIT;
+    if (flags & DD_TRI_UNFILLED)      index |= R200_UNFILLED_BIT;
+ 
+-   if (index != rmesa->swtcl.RenderIndex) {
++   if (index != rmesa->radeon.swtcl.RenderIndex) {
+       tnl->Driver.Render.Points = rast_tab[index].points;
+       tnl->Driver.Render.Line = rast_tab[index].line;
+       tnl->Driver.Render.ClippedLine = rast_tab[index].line;
+@@ -622,7 +562,7 @@ void r200ChooseRenderState( GLcontext *ctx )
+ 	 tnl->Driver.Render.ClippedPolygon = _tnl_RenderClippedPolygon;
+       }
+ 
+-      rmesa->swtcl.RenderIndex = index;
++      rmesa->radeon.swtcl.RenderIndex = index;
+    }
+ }
+ 
+@@ -636,7 +576,7 @@ static void r200RasterPrimitive( GLcontext *ctx, GLuint hwprim )
+ {
+    r200ContextPtr rmesa = R200_CONTEXT(ctx);
+ 
+-   if (rmesa->swtcl.hw_primitive != hwprim) {
++   if (rmesa->radeon.swtcl.hw_primitive != hwprim) {
+       /* need to disable perspective-correct texturing for point sprites */
+       if ((hwprim & 0xf) == R200_VF_PRIM_POINT_SPRITES && ctx->Point.PointSprite) {
+ 	 if (rmesa->hw.set.cmd[SET_RE_CNTL] & R200_PERSPECTIVE_ENABLE) {
+@@ -649,14 +589,14 @@ static void r200RasterPrimitive( GLcontext *ctx, GLuint hwprim )
+ 	 rmesa->hw.set.cmd[SET_RE_CNTL] |= R200_PERSPECTIVE_ENABLE;
+       }
+       R200_NEWPRIM( rmesa );
+-      rmesa->swtcl.hw_primitive = hwprim;
++      rmesa->radeon.swtcl.hw_primitive = hwprim;
+    }
+ }
+ 
+ static void r200RenderPrimitive( GLcontext *ctx, GLenum prim )
+ {
+    r200ContextPtr rmesa = R200_CONTEXT(ctx);
+-   rmesa->swtcl.render_primitive = prim;
++   rmesa->radeon.swtcl.render_primitive = prim;
+    if (prim < GL_TRIANGLES || !(ctx->_TriangleCaps & DD_TRI_UNFILLED)) 
+       r200RasterPrimitive( ctx, reduced_hw_prim(ctx, prim) );
+ }
+@@ -701,15 +641,15 @@ void r200Fallback( GLcontext *ctx, GLuint bit, GLboolean mode )
+ {
+    r200ContextPtr rmesa = R200_CONTEXT(ctx);
+    TNLcontext *tnl = TNL_CONTEXT(ctx);
+-   GLuint oldfallback = rmesa->Fallback;
++   GLuint oldfallback = rmesa->radeon.Fallback;
+ 
+    if (mode) {
+-      rmesa->Fallback |= bit;
++      rmesa->radeon.Fallback |= bit;
+       if (oldfallback == 0) {
+-	 R200_FIREVERTICES( rmesa );
++	 radeon_firevertices(&rmesa->radeon);
+ 	 TCL_FALLBACK( ctx, R200_TCL_FALLBACK_RASTER, GL_TRUE );
+ 	 _swsetup_Wakeup( ctx );
+-	 rmesa->swtcl.RenderIndex = ~0;
++	 rmesa->radeon.swtcl.RenderIndex = ~0;
+          if (R200_DEBUG & DEBUG_FALLBACKS) {
+             fprintf(stderr, "R200 begin rasterization fallback: 0x%x %s\n",
+                     bit, getFallbackString(bit));
+@@ -717,7 +657,7 @@ void r200Fallback( GLcontext *ctx, GLuint bit, GLboolean mode )
+       }
+    }
+    else {
+-      rmesa->Fallback &= ~bit;
++      rmesa->radeon.Fallback &= ~bit;
+       if (oldfallback == bit) {
+ 
+ 	 _swrast_flush( ctx );
+@@ -731,14 +671,14 @@ void r200Fallback( GLcontext *ctx, GLuint bit, GLboolean mode )
+ 
+ 	 tnl->Driver.Render.ResetLineStipple = r200ResetLineStipple;
+ 	 TCL_FALLBACK( ctx, R200_TCL_FALLBACK_RASTER, GL_FALSE );
+-	 if (rmesa->TclFallback) {
+-	    /* These are already done if rmesa->TclFallback goes to
++	 if (rmesa->radeon.TclFallback) {
++	    /* These are already done if rmesa->radeon.TclFallback goes to
+ 	     * zero above. But not if it doesn't (R200_NO_TCL for
+ 	     * example?)
+ 	     */
+ 	    _tnl_invalidate_vertex_state( ctx, ~0 );
+ 	    _tnl_invalidate_vertices( ctx, ~0 );
+-	    RENDERINPUTS_ZERO( rmesa->tnl_index_bitset );
++	    RENDERINPUTS_ZERO( rmesa->radeon.tnl_index_bitset );
+ 	    r200ChooseVertexState( ctx );
+ 	    r200ChooseRenderState( ctx );
+ 	 }
+@@ -772,7 +712,7 @@ r200PointsBitmap( GLcontext *ctx, GLint px, GLint py,
+    r200ContextPtr rmesa = R200_CONTEXT(ctx);
+    const GLfloat *rc = ctx->Current.RasterColor; 
+    GLint row, col;
+-   r200Vertex vert;
++   radeonVertex vert;
+    GLuint orig_vte;
+    GLuint h;
+ 
+@@ -794,7 +734,7 @@ r200PointsBitmap( GLcontext *ctx, GLint px, GLint py,
+       vte |= R200_VTX_W0_FMT;
+       vap &= ~R200_VAP_FORCE_W_TO_ONE;
+ 
+-      rmesa->swtcl.vertex_size = 5;
++      rmesa->radeon.swtcl.vertex_size = 5;
+ 
+       if ( (rmesa->hw.vtx.cmd[VTX_VTXFMT_0] != fmt_0)
+ 	   || (rmesa->hw.vtx.cmd[VTX_VTXFMT_1] != fmt_1) ) {
+@@ -871,10 +811,10 @@ r200PointsBitmap( GLcontext *ctx, GLint px, GLint py,
+ 
+    /* Update window height
+     */
+-   LOCK_HARDWARE( rmesa );
+-   UNLOCK_HARDWARE( rmesa );
+-   h = rmesa->dri.drawable->h + rmesa->dri.drawable->y;
+-   px += rmesa->dri.drawable->x;
++   LOCK_HARDWARE( &rmesa->radeon );
++   UNLOCK_HARDWARE( &rmesa->radeon );
++   h = rmesa->radeon.dri.drawable->h + rmesa->radeon.dri.drawable->y;
++   px += rmesa->radeon.dri.drawable->x;
+ 
+    /* Clipping handled by existing mechansims in r200_ioctl.c?
+     */
+@@ -929,7 +869,7 @@ r200PointsBitmap( GLcontext *ctx, GLint px, GLint py,
+ 
+    /* Need to restore vertexformat?
+     */
+-   if (rmesa->TclFallback)
++   if (rmesa->radeon.TclFallback)
+       r200ChooseVertexState( ctx );
+ }
+ 
+@@ -962,17 +902,13 @@ void r200InitSwtcl( GLcontext *ctx )
+    _tnl_init_vertices( ctx, ctx->Const.MaxArrayLockSize + 12, 
+ 		       36 * sizeof(GLfloat) );
+    
+-   rmesa->swtcl.verts = (GLubyte *)tnl->clipspace.vertex_buf;
+-   rmesa->swtcl.RenderIndex = ~0;
+-   rmesa->swtcl.render_primitive = GL_TRIANGLES;
+-   rmesa->swtcl.hw_primitive = 0;
++   rmesa->radeon.swtcl.verts = (GLubyte *)tnl->clipspace.vertex_buf;
++   rmesa->radeon.swtcl.RenderIndex = ~0;
++   rmesa->radeon.swtcl.render_primitive = GL_TRIANGLES;
++   rmesa->radeon.swtcl.hw_primitive = 0;
+ }
+ 
+ 
+ void r200DestroySwtcl( GLcontext *ctx )
+ {
+-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+-
+-   if (rmesa->swtcl.indexed_verts.buf) 
+-      r200ReleaseDmaRegion( rmesa, &rmesa->swtcl.indexed_verts, __FUNCTION__ );
+ }
+diff --git a/src/mesa/drivers/dri/r200/r200_swtcl.h b/src/mesa/drivers/dri/r200/r200_swtcl.h
+index 8c29fd0..a4051a4 100644
+--- a/src/mesa/drivers/dri/r200/r200_swtcl.h
++++ b/src/mesa/drivers/dri/r200/r200_swtcl.h
+@@ -52,15 +52,11 @@ extern void r200BuildVertices( GLcontext *ctx, GLuint start, GLuint count,
+ extern void r200PrintSetupFlags(char *msg, GLuint flags );
+ 
+ 
+-extern void r200_emit_indexed_verts( GLcontext *ctx,
+-				       GLuint start,
+-				       GLuint count );
+-
+ extern void r200_translate_vertex( GLcontext *ctx, 
+-				     const r200Vertex *src, 
++				     const radeonVertex *src, 
+ 				     SWvertex *dst );
+ 
+-extern void r200_print_vertex( GLcontext *ctx, const r200Vertex *v );
++extern void r200_print_vertex( GLcontext *ctx, const radeonVertex *v );
+ 
+ extern void r200_import_float_colors( GLcontext *ctx );
+ extern void r200_import_float_spec_colors( GLcontext *ctx );
+@@ -70,5 +66,5 @@ extern void r200PointsBitmap( GLcontext *ctx, GLint px, GLint py,
+ 			      const struct gl_pixelstore_attrib *unpack,
+ 			      const GLubyte *bitmap );
+ 
+-
++void r200_swtcl_flush(GLcontext *ctx, uint32_t current_offset);
+ #endif
+diff --git a/src/mesa/drivers/dri/r200/r200_tcl.c b/src/mesa/drivers/dri/r200/r200_tcl.c
+index 99aecfe..8e0fb14 100644
+--- a/src/mesa/drivers/dri/r200/r200_tcl.c
++++ b/src/mesa/drivers/dri/r200/r200_tcl.c
+@@ -123,7 +123,7 @@ static GLboolean discrete_prim[0x10] = {
+ 
+ #define RESET_STIPPLE() do {			\
+    R200_STATECHANGE( rmesa, lin );		\
+-   r200EmitState( rmesa );			\
++   radeonEmitState(&rmesa->radeon);			\
+ } while (0)
+ 
+ #define AUTO_STIPPLE( mode )  do {		\
+@@ -134,7 +134,7 @@ static GLboolean discrete_prim[0x10] = {
+    else						\
+       rmesa->hw.lin.cmd[LIN_RE_LINE_PATTERN] &=	\
+ 	 ~R200_LINE_PATTERN_AUTO_RESET;	\
+-   r200EmitState( rmesa );			\
++   radeonEmitState(&rmesa->radeon);			\
+ } while (0)
+ 
+ 
+@@ -142,25 +142,23 @@ static GLboolean discrete_prim[0x10] = {
+ 
+ static GLushort *r200AllocElts( r200ContextPtr rmesa, GLuint nr ) 
+ {
+-   if (rmesa->dma.flush == r200FlushElts &&
+-       rmesa->store.cmd_used + nr*2 < R200_CMD_BUF_SZ) {
++   if (rmesa->radeon.dma.flush == r200FlushElts &&
++       rmesa->tcl.elt_used + nr*2 < R200_ELT_BUF_SZ) {
+ 
+-      GLushort *dest = (GLushort *)(rmesa->store.cmd_buf +
+-				    rmesa->store.cmd_used);
++      GLushort *dest = (GLushort *)(rmesa->tcl.elt_dma_bo->ptr +
++				    rmesa->tcl.elt_used);
+ 
+-      rmesa->store.cmd_used += nr*2;
++      rmesa->tcl.elt_used += nr*2;
+ 
+       return dest;
+    }
+    else {
+-      if (rmesa->dma.flush)
+-	 rmesa->dma.flush( rmesa );
++      if (rmesa->radeon.dma.flush)
++	 rmesa->radeon.dma.flush( rmesa->radeon.glCtx );
+ 
+-      r200EnsureCmdBufSpace( rmesa, AOS_BUFSZ(rmesa->tcl.nr_aos_components) +
+-			     rmesa->hw.max_state_size + ELTS_BUFSZ(nr) );
++      rcommonEnsureCmdBufSpace(&rmesa->radeon, AOS_BUFSZ(rmesa->tcl.nr_aos_components), __FUNCTION__);
+ 
+       r200EmitAOS( rmesa,
+-		   rmesa->tcl.aos_components,
+ 		   rmesa->tcl.nr_aos_components, 0 );
+ 
+       return r200AllocEltsOpenEnded( rmesa, rmesa->tcl.hw_primitive, nr );
+@@ -188,13 +186,14 @@ static void r200EmitPrim( GLcontext *ctx,
+    r200ContextPtr rmesa = R200_CONTEXT( ctx );
+    r200TclPrimitive( ctx, prim, hwprim );
+    
+-   r200EnsureCmdBufSpace( rmesa, AOS_BUFSZ(rmesa->tcl.nr_aos_components) +
+-			  rmesa->hw.max_state_size + VBUF_BUFSZ );
++   //   fprintf(stderr,"Emit prim %d\n", rmesa->tcl.nr_aos_components);
++   rcommonEnsureCmdBufSpace( &rmesa->radeon,
++			     AOS_BUFSZ(rmesa->tcl.nr_aos_components) +
++			     rmesa->radeon.hw.max_state_size + VBUF_BUFSZ, __FUNCTION__ );
+ 
+    r200EmitAOS( rmesa,
+-		  rmesa->tcl.aos_components,
+-		  rmesa->tcl.nr_aos_components,
+-		  start );
++		rmesa->tcl.nr_aos_components,
++		start );
+    
+    /* Why couldn't this packet have taken an offset param?
+     */
+@@ -394,7 +393,7 @@ static GLboolean r200_run_tcl_render( GLcontext *ctx,
+ 
+    /* TODO: separate this from the swtnl pipeline 
+     */
+-   if (rmesa->TclFallback)
++   if (rmesa->radeon.TclFallback)
+       return GL_TRUE;	/* fallback to software t&l */
+ 
+    if (R200_DEBUG & DEBUG_PRIMS)
+@@ -405,8 +404,9 @@ static GLboolean r200_run_tcl_render( GLcontext *ctx,
+ 
+    /* Validate state:
+     */
+-   if (rmesa->NewGLState)
+-      r200ValidateState( ctx );
++   if (rmesa->radeon.NewGLState)
++      if (!r200ValidateState( ctx ))
++         return GL_TRUE; /* fallback to sw t&l */
+ 
+    if (!ctx->VertexProgram._Enabled) {
+    /* NOTE: inputs != tnl->render_inputs - these are the untransformed
+@@ -565,15 +565,11 @@ static void transition_to_hwtnl( GLcontext *ctx )
+ 
+    tnl->Driver.NotifyMaterialChange = r200UpdateMaterial;
+ 
+-   if ( rmesa->dma.flush )			
+-      rmesa->dma.flush( rmesa );	
++   if ( rmesa->radeon.dma.flush )			
++      rmesa->radeon.dma.flush( rmesa->radeon.glCtx );	
+ 
+-   rmesa->dma.flush = NULL;
++   rmesa->radeon.dma.flush = NULL;
+    
+-   if (rmesa->swtcl.indexed_verts.buf) 
+-      r200ReleaseDmaRegion( rmesa, &rmesa->swtcl.indexed_verts, 
+-			      __FUNCTION__ );
+-
+    R200_STATECHANGE( rmesa, vap );
+    rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] |= R200_VAP_TCL_ENABLE;
+    rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] &= ~R200_VAP_FORCE_W_TO_ONE;
+@@ -631,10 +627,10 @@ static char *getFallbackString(GLuint bit)
+ void r200TclFallback( GLcontext *ctx, GLuint bit, GLboolean mode )
+ {
+    r200ContextPtr rmesa = R200_CONTEXT(ctx);
+-   GLuint oldfallback = rmesa->TclFallback;
++   GLuint oldfallback = rmesa->radeon.TclFallback;
+ 
+    if (mode) {
+-      rmesa->TclFallback |= bit;
++      rmesa->radeon.TclFallback |= bit;
+       if (oldfallback == 0) {
+ 	 if (R200_DEBUG & DEBUG_FALLBACKS) 
+ 	    fprintf(stderr, "R200 begin tcl fallback %s\n",
+@@ -643,7 +639,7 @@ void r200TclFallback( GLcontext *ctx, GLuint bit, GLboolean mode )
+       }
+    }
+    else {
+-      rmesa->TclFallback &= ~bit;
++      rmesa->radeon.TclFallback &= ~bit;
+       if (oldfallback == bit) {
+ 	 if (R200_DEBUG & DEBUG_FALLBACKS) 
+ 	    fprintf(stderr, "R200 end tcl fallback %s\n",
+diff --git a/src/mesa/drivers/dri/r200/r200_tex.c b/src/mesa/drivers/dri/r200/r200_tex.c
+index 5a4db33..19a6cad 100644
+--- a/src/mesa/drivers/dri/r200/r200_tex.c
++++ b/src/mesa/drivers/dri/r200/r200_tex.c
+@@ -43,8 +43,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "main/teximage.h"
+ #include "main/texobj.h"
+ 
+-#include "texmem.h"
+-
++#include "radeon_mipmap_tree.h"
+ #include "r200_context.h"
+ #include "r200_state.h"
+ #include "r200_ioctl.h"
+@@ -63,10 +62,11 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+  * \param twrap Wrap mode for the \a t texture coordinate
+  */
+ 
+-static void r200SetTexWrap( r200TexObjPtr t, GLenum swrap, GLenum twrap, GLenum rwrap )
++static void r200SetTexWrap( radeonTexObjPtr t, GLenum swrap, GLenum twrap, GLenum rwrap )
+ {
+    GLboolean  is_clamp = GL_FALSE;
+    GLboolean  is_clamp_to_border = GL_FALSE;
++   struct gl_texture_object *tObj = &t->base;
+ 
+    t->pp_txfilter &= ~(R200_CLAMP_S_MASK | R200_CLAMP_T_MASK | R200_BORDER_MODE_D3D);
+ 
+@@ -103,7 +103,7 @@ static void r200SetTexWrap( r200TexObjPtr t, GLenum swrap, GLenum twrap, GLenum
+       _mesa_problem(NULL, "bad S wrap mode in %s", __FUNCTION__);
+    }
+ 
+-   if (t->base.tObj->Target != GL_TEXTURE_1D) {
++   if (tObj->Target != GL_TEXTURE_1D) {
+       switch ( twrap ) {
+       case GL_REPEAT:
+          t->pp_txfilter |= R200_CLAMP_T_WRAP;
+@@ -180,7 +180,7 @@ static void r200SetTexWrap( r200TexObjPtr t, GLenum swrap, GLenum twrap, GLenum
+    t->border_fallback = (is_clamp && is_clamp_to_border);
+ }
+ 
+-static void r200SetTexMaxAnisotropy( r200TexObjPtr t, GLfloat max )
++static void r200SetTexMaxAnisotropy( radeonTexObjPtr t, GLfloat max )
+ {
+    t->pp_txfilter &= ~R200_MAX_ANISO_MASK;
+ 
+@@ -205,10 +205,13 @@ static void r200SetTexMaxAnisotropy( r200TexObjPtr t, GLfloat max )
+  * \param magf Texture magnification mode
+  */
+ 
+-static void r200SetTexFilter( r200TexObjPtr t, GLenum minf, GLenum magf )
++static void r200SetTexFilter( radeonTexObjPtr t, GLenum minf, GLenum magf )
+ {
+    GLuint anisotropy = (t->pp_txfilter & R200_MAX_ANISO_MASK);
+ 
++   /* Force revalidation to account for switches from/to mipmapping. */
++   t->validated = GL_FALSE;
++
+    t->pp_txfilter &= ~(R200_MIN_FILTER_MASK | R200_MAG_FILTER_MASK);
+    t->pp_txformat_x &= ~R200_VOLUME_FILTER_MASK;
+ 
+@@ -267,693 +270,12 @@ static void r200SetTexFilter( r200TexObjPtr t, GLenum minf, GLenum magf )
+    }
+ }
+ 
+-static void r200SetTexBorderColor( r200TexObjPtr t, GLubyte c[4] )
+-{
+-   t->pp_border_color = r200PackColor( 4, c[0], c[1], c[2], c[3] );
+-}
+-
+-
+-/**
+- * Allocate space for and load the mesa images into the texture memory block.
+- * This will happen before drawing with a new texture, or drawing with a
+- * texture after it was swapped out or teximaged again.
+- */
+-
+-static r200TexObjPtr r200AllocTexObj( struct gl_texture_object *texObj )
+-{
+-   r200TexObjPtr t;
+-
+-   t = CALLOC_STRUCT( r200_tex_obj );
+-   texObj->DriverData = t;
+-   if ( t != NULL ) {
+-      if ( R200_DEBUG & DEBUG_TEXTURE ) {
+-	 fprintf( stderr, "%s( %p, %p )\n", __FUNCTION__, (void *)texObj, 
+-		  (void *)t );
+-      }
+-
+-      /* Initialize non-image-dependent parts of the state:
+-       */
+-      t->base.tObj = texObj;
+-      t->border_fallback = GL_FALSE;
+-
+-      make_empty_list( & t->base );
+-
+-      r200SetTexWrap( t, texObj->WrapS, texObj->WrapT, texObj->WrapR );
+-      r200SetTexMaxAnisotropy( t, texObj->MaxAnisotropy );
+-      r200SetTexFilter( t, texObj->MinFilter, texObj->MagFilter );
+-      r200SetTexBorderColor( t, texObj->_BorderChan );
+-   }
+-
+-   return t;
+-}
+-
+-/* try to find a format which will only need a memcopy */
+-static const struct gl_texture_format *
+-r200Choose8888TexFormat( GLenum srcFormat, GLenum srcType )
+-{
+-   const GLuint ui = 1;
+-   const GLubyte littleEndian = *((const GLubyte *) &ui);
+-
+-   if ((srcFormat == GL_RGBA && srcType == GL_UNSIGNED_INT_8_8_8_8) ||
+-       (srcFormat == GL_RGBA && srcType == GL_UNSIGNED_BYTE && !littleEndian) ||
+-       (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_INT_8_8_8_8_REV) ||
+-       (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_BYTE && littleEndian)) {
+-      return &_mesa_texformat_rgba8888;
+-   }
+-   else if ((srcFormat == GL_RGBA && srcType == GL_UNSIGNED_INT_8_8_8_8_REV) ||
+-       (srcFormat == GL_RGBA && srcType == GL_UNSIGNED_BYTE && littleEndian) ||
+-       (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_INT_8_8_8_8) ||
+-       (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_BYTE && !littleEndian)) {
+-      return &_mesa_texformat_rgba8888_rev;
+-   }
+-   else return _dri_texformat_argb8888;
+-}
+-
+-static const struct gl_texture_format *
+-r200ChooseTextureFormat( GLcontext *ctx, GLint internalFormat,
+-                           GLenum format, GLenum type )
+-{
+-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+-   const GLboolean do32bpt =
+-       ( rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_32 );
+-   const GLboolean force16bpt =
+-       ( rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_FORCE_16 );
+-   (void) format;
+-
+-   switch ( internalFormat ) {
+-   case 4:
+-   case GL_RGBA:
+-   case GL_COMPRESSED_RGBA:
+-      switch ( type ) {
+-      case GL_UNSIGNED_INT_10_10_10_2:
+-      case GL_UNSIGNED_INT_2_10_10_10_REV:
+-	 return do32bpt ? _dri_texformat_argb8888 : _dri_texformat_argb1555;
+-      case GL_UNSIGNED_SHORT_4_4_4_4:
+-      case GL_UNSIGNED_SHORT_4_4_4_4_REV:
+-	 return _dri_texformat_argb4444;
+-      case GL_UNSIGNED_SHORT_5_5_5_1:
+-      case GL_UNSIGNED_SHORT_1_5_5_5_REV:
+-	 return _dri_texformat_argb1555;
+-      default:
+-         return do32bpt ?
+-	    r200Choose8888TexFormat(format, type) : _dri_texformat_argb4444;
+-      }
+-
+-   case 3:
+-   case GL_RGB:
+-   case GL_COMPRESSED_RGB:
+-      switch ( type ) {
+-      case GL_UNSIGNED_SHORT_4_4_4_4:
+-      case GL_UNSIGNED_SHORT_4_4_4_4_REV:
+-	 return _dri_texformat_argb4444;
+-      case GL_UNSIGNED_SHORT_5_5_5_1:
+-      case GL_UNSIGNED_SHORT_1_5_5_5_REV:
+-	 return _dri_texformat_argb1555;
+-      case GL_UNSIGNED_SHORT_5_6_5:
+-      case GL_UNSIGNED_SHORT_5_6_5_REV:
+-	 return _dri_texformat_rgb565;
+-      default:
+-         return do32bpt ? _dri_texformat_argb8888 : _dri_texformat_rgb565;
+-      }
+-
+-   case GL_RGBA8:
+-   case GL_RGB10_A2:
+-   case GL_RGBA12:
+-   case GL_RGBA16:
+-      return !force16bpt ?
+-	  r200Choose8888TexFormat(format, type) : _dri_texformat_argb4444;
+-
+-   case GL_RGBA4:
+-   case GL_RGBA2:
+-      return _dri_texformat_argb4444;
+-
+-   case GL_RGB5_A1:
+-      return _dri_texformat_argb1555;
+-
+-   case GL_RGB8:
+-   case GL_RGB10:
+-   case GL_RGB12:
+-   case GL_RGB16:
+-      return !force16bpt ? _dri_texformat_argb8888 : _dri_texformat_rgb565;
+-
+-   case GL_RGB5:
+-   case GL_RGB4:
+-   case GL_R3_G3_B2:
+-      return _dri_texformat_rgb565;
+-
+-   case GL_ALPHA:
+-   case GL_ALPHA4:
+-   case GL_ALPHA8:
+-   case GL_ALPHA12:
+-   case GL_ALPHA16:
+-   case GL_COMPRESSED_ALPHA:
+-   /* can't use a8 format since interpreting hw I8 as a8 would result
+-      in wrong rgb values (same as alpha value instead of 0). */
+-      return _dri_texformat_al88;
+-
+-   case 1:
+-   case GL_LUMINANCE:
+-   case GL_LUMINANCE4:
+-   case GL_LUMINANCE8:
+-   case GL_LUMINANCE12:
+-   case GL_LUMINANCE16:
+-   case GL_COMPRESSED_LUMINANCE:
+-      return _dri_texformat_l8;
+-
+-   case 2:
+-   case GL_LUMINANCE_ALPHA:
+-   case GL_LUMINANCE4_ALPHA4:
+-   case GL_LUMINANCE6_ALPHA2:
+-   case GL_LUMINANCE8_ALPHA8:
+-   case GL_LUMINANCE12_ALPHA4:
+-   case GL_LUMINANCE12_ALPHA12:
+-   case GL_LUMINANCE16_ALPHA16:
+-   case GL_COMPRESSED_LUMINANCE_ALPHA:
+-      return _dri_texformat_al88;
+-
+-   case GL_INTENSITY:
+-   case GL_INTENSITY4:
+-   case GL_INTENSITY8:
+-   case GL_INTENSITY12:
+-   case GL_INTENSITY16:
+-   case GL_COMPRESSED_INTENSITY:
+-       return _dri_texformat_i8;
+-
+-   case GL_YCBCR_MESA:
+-      if (type == GL_UNSIGNED_SHORT_8_8_APPLE ||
+-          type == GL_UNSIGNED_BYTE)
+-         return &_mesa_texformat_ycbcr;
+-      else
+-         return &_mesa_texformat_ycbcr_rev;
+-
+-   case GL_RGB_S3TC:
+-   case GL_RGB4_S3TC:
+-   case GL_COMPRESSED_RGB_S3TC_DXT1_EXT:
+-      return &_mesa_texformat_rgb_dxt1;
+-
+-   case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT:
+-      return &_mesa_texformat_rgba_dxt1;
+-
+-   case GL_RGBA_S3TC:
+-   case GL_RGBA4_S3TC:
+-   case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT:
+-      return &_mesa_texformat_rgba_dxt3;
+-
+-   case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT:
+-      return &_mesa_texformat_rgba_dxt5;
+-
+-   default:
+-      _mesa_problem(ctx,
+-         "unexpected internalFormat 0x%x in r200ChooseTextureFormat",
+-         (int) internalFormat);
+-      return NULL;
+-   }
+-
+-   return NULL; /* never get here */
+-}
+-
+-
+-static GLboolean
+-r200ValidateClientStorage( GLcontext *ctx, GLenum target,
+-			   GLint internalFormat,
+-			   GLint srcWidth, GLint srcHeight, 
+-                           GLenum format, GLenum type,  const void *pixels,
+-			   const struct gl_pixelstore_attrib *packing,
+-			   struct gl_texture_object *texObj,
+-			   struct gl_texture_image *texImage)
+-
+-{
+-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+-
+-   if ( R200_DEBUG & DEBUG_TEXTURE )
+-      fprintf(stderr, "intformat %s format %s type %s\n",
+-	      _mesa_lookup_enum_by_nr( internalFormat ),
+-	      _mesa_lookup_enum_by_nr( format ),
+-	      _mesa_lookup_enum_by_nr( type ));
+-
+-   if (!ctx->Unpack.ClientStorage)
+-      return 0;
+-
+-   if (ctx->_ImageTransferState ||
+-       texImage->IsCompressed ||
+-       texObj->GenerateMipmap)
+-      return 0;
+-
+-
+-   /* This list is incomplete, may be different on ppc???
+-    */
+-   switch ( internalFormat ) {
+-   case GL_RGBA:
+-      if ( format == GL_BGRA && type == GL_UNSIGNED_INT_8_8_8_8_REV ) {
+-	 texImage->TexFormat = _dri_texformat_argb8888;
+-      }
+-      else
+-	 return 0;
+-      break;
+-
+-   case GL_RGB:
+-      if ( format == GL_RGB && type == GL_UNSIGNED_SHORT_5_6_5 ) {
+-	 texImage->TexFormat = _dri_texformat_rgb565;
+-      }
+-      else
+-	 return 0;
+-      break;
+-
+-   case GL_YCBCR_MESA:
+-      if ( format == GL_YCBCR_MESA && 
+-	   type == GL_UNSIGNED_SHORT_8_8_REV_APPLE ) {
+-	 texImage->TexFormat = &_mesa_texformat_ycbcr_rev;
+-      }
+-      else if ( format == GL_YCBCR_MESA && 
+-		(type == GL_UNSIGNED_SHORT_8_8_APPLE || 
+-		 type == GL_UNSIGNED_BYTE)) {
+-	 texImage->TexFormat = &_mesa_texformat_ycbcr;
+-      }
+-      else
+-	 return 0;
+-      break;
+-
+-   default:
+-      return 0;
+-   }
+-
+-   /* Could deal with these packing issues, but currently don't:
+-    */
+-   if (packing->SkipPixels || 
+-       packing->SkipRows || 
+-       packing->SwapBytes ||
+-       packing->LsbFirst) {
+-      return 0;
+-   }
+-
+-   {      
+-      GLint srcRowStride = _mesa_image_row_stride(packing, srcWidth,
+-						  format, type);
+-
+-      
+-      if ( R200_DEBUG & DEBUG_TEXTURE )
+-	 fprintf(stderr, "%s: srcRowStride %d/%x\n", 
+-		 __FUNCTION__, srcRowStride, srcRowStride);
+-
+-      /* Could check this later in upload, pitch restrictions could be
+-       * relaxed, but would need to store the image pitch somewhere,
+-       * as packing details might change before image is uploaded:
+-       */
+-      if (!r200IsGartMemory( rmesa, pixels, srcHeight * srcRowStride ) ||
+-	  (srcRowStride & 63))
+-	 return 0;
+-
+-
+-      /* Have validated that _mesa_transfer_teximage would be a straight
+-       * memcpy at this point.  NOTE: future calls to TexSubImage will
+-       * overwrite the client data.  This is explicitly mentioned in the
+-       * extension spec.
+-       */
+-      texImage->Data = (void *)pixels;
+-      texImage->IsClientData = GL_TRUE;
+-      texImage->RowStride = srcRowStride / texImage->TexFormat->TexelBytes;
+-
+-      return 1;
+-   }
+-}
+-
+-
+-static void r200TexImage1D( GLcontext *ctx, GLenum target, GLint level,
+-                              GLint internalFormat,
+-                              GLint width, GLint border,
+-                              GLenum format, GLenum type, const GLvoid *pixels,
+-                              const struct gl_pixelstore_attrib *packing,
+-                              struct gl_texture_object *texObj,
+-                              struct gl_texture_image *texImage )
+-{
+-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
+-
+-   if ( t ) {
+-      driSwapOutTextureObject( t );
+-   }
+-   else {
+-      t = (driTextureObject *) r200AllocTexObj( texObj );
+-      if (!t) {
+-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage1D");
+-         return;
+-      }
+-   }
+-
+-   /* Note, this will call ChooseTextureFormat */
+-   _mesa_store_teximage1d(ctx, target, level, internalFormat,
+-                          width, border, format, type, pixels,
+-                          &ctx->Unpack, texObj, texImage);
+-
+-   t->dirty_images[0] |= (1 << level);
+-}
+-
+-
+-static void r200TexSubImage1D( GLcontext *ctx, GLenum target, GLint level,
+-                                 GLint xoffset,
+-                                 GLsizei width,
+-                                 GLenum format, GLenum type,
+-                                 const GLvoid *pixels,
+-                                 const struct gl_pixelstore_attrib *packing,
+-                                 struct gl_texture_object *texObj,
+-                                 struct gl_texture_image *texImage )
+-{
+-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
+-
+-   assert( t ); /* this _should_ be true */
+-   if ( t ) {
+-      driSwapOutTextureObject( t );
+-   }
+-   else {
+-      t = (driTextureObject *) r200AllocTexObj( texObj );
+-      if (!t) {
+-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage1D");
+-         return;
+-      }
+-   }
+-
+-   _mesa_store_texsubimage1d(ctx, target, level, xoffset, width,
+-			     format, type, pixels, packing, texObj,
+-			     texImage);
+-
+-   t->dirty_images[0] |= (1 << level);
+-}
+-
+-
+-static void r200TexImage2D( GLcontext *ctx, GLenum target, GLint level,
+-                              GLint internalFormat,
+-                              GLint width, GLint height, GLint border,
+-                              GLenum format, GLenum type, const GLvoid *pixels,
+-                              const struct gl_pixelstore_attrib *packing,
+-                              struct gl_texture_object *texObj,
+-                              struct gl_texture_image *texImage )
+-{
+-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
+-   GLuint face;
+-
+-   /* which cube face or ordinary 2D image */
+-   switch (target) {
+-   case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
+-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
+-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
+-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
+-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
+-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
+-      face = (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
+-      ASSERT(face < 6);
+-      break;
+-   default:
+-      face = 0;
+-   }
+-
+-   if ( t != NULL ) {
+-      driSwapOutTextureObject( t );
+-   }
+-   else {
+-      t = (driTextureObject *) r200AllocTexObj( texObj );
+-      if (!t) {
+-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage2D");
+-         return;
+-      }
+-   }
+-
+-   texImage->IsClientData = GL_FALSE;
+-
+-   if (r200ValidateClientStorage( ctx, target, 
+-				  internalFormat, 
+-				  width, height, 
+-				  format, type, pixels, 
+-				  packing, texObj, texImage)) {
+-      if (R200_DEBUG & DEBUG_TEXTURE)
+-	 fprintf(stderr, "%s: Using client storage\n", __FUNCTION__); 
+-   }
+-   else {
+-      if (R200_DEBUG & DEBUG_TEXTURE)
+-	 fprintf(stderr, "%s: Using normal storage\n", __FUNCTION__); 
+-
+-      /* Normal path: copy (to cached memory) and eventually upload
+-       * via another copy to GART memory and then a blit...  Could
+-       * eliminate one copy by going straight to (permanent) GART.
+-       *
+-       * Note, this will call r200ChooseTextureFormat.
+-       */
+-      _mesa_store_teximage2d(ctx, target, level, internalFormat,
+-			     width, height, border, format, type, pixels,
+-			     &ctx->Unpack, texObj, texImage);
+-      
+-      t->dirty_images[face] |= (1 << level);
+-   }
+-}
+-
+-
+-static void r200TexSubImage2D( GLcontext *ctx, GLenum target, GLint level,
+-                                 GLint xoffset, GLint yoffset,
+-                                 GLsizei width, GLsizei height,
+-                                 GLenum format, GLenum type,
+-                                 const GLvoid *pixels,
+-                                 const struct gl_pixelstore_attrib *packing,
+-                                 struct gl_texture_object *texObj,
+-                                 struct gl_texture_image *texImage )
+-{
+-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
+-   GLuint face;
+-
+-   /* which cube face or ordinary 2D image */
+-   switch (target) {
+-   case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
+-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
+-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
+-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
+-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
+-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
+-      face = (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
+-      ASSERT(face < 6);
+-      break;
+-   default:
+-      face = 0;
+-   }
+-
+-   assert( t ); /* this _should_ be true */
+-   if ( t ) {
+-      driSwapOutTextureObject( t );
+-   }
+-   else {
+-      t = (driTextureObject *) r200AllocTexObj( texObj );
+-      if (!t) {
+-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage2D");
+-         return;
+-      }
+-   }
+-
+-   _mesa_store_texsubimage2d(ctx, target, level, xoffset, yoffset, width,
+-			     height, format, type, pixels, packing, texObj,
+-			     texImage);
+-
+-   t->dirty_images[face] |= (1 << level);
+-}
+-
+-
+-static void r200CompressedTexImage2D( GLcontext *ctx, GLenum target, GLint level,
+-                              GLint internalFormat,
+-                              GLint width, GLint height, GLint border,
+-                              GLsizei imageSize, const GLvoid *data,
+-                              struct gl_texture_object *texObj,
+-                              struct gl_texture_image *texImage )
++static void r200SetTexBorderColor( radeonTexObjPtr t, GLubyte c[4] )
+ {
+-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
+-   GLuint face;
+-
+-   /* which cube face or ordinary 2D image */
+-   switch (target) {
+-   case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
+-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
+-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
+-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
+-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
+-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
+-      face = (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
+-      ASSERT(face < 6);
+-      break;
+-   default:
+-      face = 0;
+-   }
+-
+-   if ( t != NULL ) {
+-      driSwapOutTextureObject( t );
+-   }
+-   else {
+-      t = (driTextureObject *) r200AllocTexObj( texObj );
+-      if (!t) {
+-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glCompressedTexImage2D");
+-         return;
+-      }
+-   }
+-
+-   texImage->IsClientData = GL_FALSE;
+-/* can't call this, different parameters. Would never evaluate to true anyway currently
+-   if (r200ValidateClientStorage( ctx, target, 
+-				  internalFormat,
+-				  width, height,
+-				  format, type, pixels,
+-				  packing, texObj, texImage)) {
+-      if (R200_DEBUG & DEBUG_TEXTURE)
+-	 fprintf(stderr, "%s: Using client storage\n", __FUNCTION__);
+-   }
+-   else */{
+-      if (R200_DEBUG & DEBUG_TEXTURE)
+-	 fprintf(stderr, "%s: Using normal storage\n", __FUNCTION__);
+-
+-      /* Normal path: copy (to cached memory) and eventually upload
+-       * via another copy to GART memory and then a blit...  Could
+-       * eliminate one copy by going straight to (permanent) GART.
+-       *
+-       * Note, this will call r200ChooseTextureFormat.
+-       */
+-      _mesa_store_compressed_teximage2d(ctx, target, level, internalFormat, width,
+-                                 height, border, imageSize, data, texObj, texImage);
+-
+-      t->dirty_images[face] |= (1 << level);
+-   }
+-}
+-
+-
+-static void r200CompressedTexSubImage2D( GLcontext *ctx, GLenum target, GLint level,
+-                                 GLint xoffset, GLint yoffset,
+-                                 GLsizei width, GLsizei height,
+-                                 GLenum format,
+-                                 GLsizei imageSize, const GLvoid *data,
+-                                 struct gl_texture_object *texObj,
+-                                 struct gl_texture_image *texImage )
+-{
+-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
+-   GLuint face;
+-
+-
+-   /* which cube face or ordinary 2D image */
+-   switch (target) {
+-   case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
+-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
+-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
+-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
+-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
+-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
+-      face = (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
+-      ASSERT(face < 6);
+-      break;
+-   default:
+-      face = 0;
+-   }
+-
+-   assert( t ); /* this _should_ be true */
+-   if ( t ) {
+-      driSwapOutTextureObject( t );
+-   }
+-   else {
+-      t = (driTextureObject *) r200AllocTexObj( texObj );
+-      if (!t) {
+-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glCompressedTexSubImage2D");
+-         return;
+-      }
+-   }
+-
+-   _mesa_store_compressed_texsubimage2d(ctx, target, level, xoffset, yoffset, width,
+-                            height, format, imageSize, data, texObj, texImage);
+-
+-   t->dirty_images[face] |= (1 << level);
+-}
+-
+-
+-#if ENABLE_HW_3D_TEXTURE
+-static void r200TexImage3D( GLcontext *ctx, GLenum target, GLint level,
+-                            GLint internalFormat,
+-                            GLint width, GLint height, GLint depth,
+-                            GLint border,
+-                            GLenum format, GLenum type, const GLvoid *pixels,
+-                            const struct gl_pixelstore_attrib *packing,
+-                            struct gl_texture_object *texObj,
+-                            struct gl_texture_image *texImage )
+-{
+-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
+-
+-   if ( t ) {
+-      driSwapOutTextureObject( t );
+-   }
+-   else {
+-      t = (driTextureObject *) r200AllocTexObj( texObj );
+-      if (!t) {
+-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage3D");
+-         return;
+-      }
+-   }
+-
+-   texImage->IsClientData = GL_FALSE;
+-
+-#if 0
+-   if (r200ValidateClientStorage( ctx, target, 
+-				  internalFormat, 
+-				  width, height, 
+-				  format, type, pixels, 
+-				  packing, texObj, texImage)) {
+-      if (R200_DEBUG & DEBUG_TEXTURE)
+-	 fprintf(stderr, "%s: Using client storage\n", __FUNCTION__); 
+-   }
+-   else
+-#endif
+-   {
+-      if (R200_DEBUG & DEBUG_TEXTURE)
+-	 fprintf(stderr, "%s: Using normal storage\n", __FUNCTION__); 
+-
+-      /* Normal path: copy (to cached memory) and eventually upload
+-       * via another copy to GART memory and then a blit...  Could
+-       * eliminate one copy by going straight to (permanent) GART.
+-       *
+-       * Note, this will call r200ChooseTextureFormat.
+-       */
+-      _mesa_store_teximage3d(ctx, target, level, internalFormat,
+-			     width, height, depth, border,
+-                             format, type, pixels,
+-			     &ctx->Unpack, texObj, texImage);
+-      
+-      t->dirty_images[0] |= (1 << level);
+-   }
++   t->pp_border_color = radeonPackColor( 4, c[0], c[1], c[2], c[3] );
+ }
+-#endif
+-
+ 
+-#if ENABLE_HW_3D_TEXTURE
+-static void
+-r200TexSubImage3D( GLcontext *ctx, GLenum target, GLint level,
+-                   GLint xoffset, GLint yoffset, GLint zoffset,
+-                   GLsizei width, GLsizei height, GLsizei depth,
+-                   GLenum format, GLenum type,
+-                   const GLvoid *pixels,
+-                   const struct gl_pixelstore_attrib *packing,
+-                   struct gl_texture_object *texObj,
+-                   struct gl_texture_image *texImage )
+-{
+-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
+-
+-/*     fprintf(stderr, "%s\n", __FUNCTION__); */
+-
+-   assert( t ); /* this _should_ be true */
+-   if ( t ) {
+-      driSwapOutTextureObject( t );
+-   }
+-   else {
+-      t = (driTextureObject *) r200AllocTexObj( texObj );
+-      if (!t) {
+-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage3D");
+-         return;
+-      }
+-      texObj->DriverData = t;
+-   }
+ 
+-   _mesa_store_texsubimage3d(ctx, target, level, xoffset, yoffset, zoffset,
+-                             width, height, depth,
+-                             format, type, pixels, packing, texObj, texImage);
+-
+-   t->dirty_images[0] |= (1 << level);
+-}
+-#endif
+ 
+ 
+ 
+@@ -978,7 +300,7 @@ static void r200TexEnv( GLcontext *ctx, GLenum target,
+       GLubyte c[4];
+       GLuint envColor;
+       UNCLAMPED_FLOAT_TO_RGBA_CHAN( c, texUnit->EnvColor );
+-      envColor = r200PackColor( 4, c[0], c[1], c[2], c[3] );
++      envColor = radeonPackColor( 4, c[0], c[1], c[2], c[3] );
+       if ( rmesa->hw.tf.cmd[TF_TFACTOR_0 + unit] != envColor ) {
+ 	 R200_STATECHANGE( rmesa, tf );
+ 	 rmesa->hw.tf.cmd[TF_TFACTOR_0 + unit] = envColor;
+@@ -997,7 +319,7 @@ static void r200TexEnv( GLcontext *ctx, GLenum target,
+        * NOTE: Add a small bias to the bias for conform mipsel.c test.
+        */
+       bias = *param + .01;
+-      min = driQueryOptionb (&rmesa->optionCache, "no_neg_lod_bias") ?
++      min = driQueryOptionb (&rmesa->radeon.optionCache, "no_neg_lod_bias") ?
+ 	  0.0 : -16.0;
+       bias = CLAMP( bias, min, 16.0 );
+       b = (int)(bias * fixed_one) & R200_LOD_BIAS_MASK;
+@@ -1034,7 +356,7 @@ static void r200TexParameter( GLcontext *ctx, GLenum target,
+ 				struct gl_texture_object *texObj,
+ 				GLenum pname, const GLfloat *params )
+ {
+-   r200TexObjPtr t = (r200TexObjPtr) texObj->DriverData;
++   radeonTexObj* t = radeon_tex_obj(texObj);
+ 
+    if ( R200_DEBUG & (DEBUG_STATE|DEBUG_TEXTURE) ) {
+       fprintf( stderr, "%s( %s )\n", __FUNCTION__,
+@@ -1068,59 +390,46 @@ static void r200TexParameter( GLcontext *ctx, GLenum target,
+        * we just have to rely on loading the right subset of mipmap levels
+        * to simulate a clamped LOD.
+        */
+-      driSwapOutTextureObject( (driTextureObject *) t );
++      if (t->mt) {
++         radeon_miptree_unreference(t->mt);
++	 t->mt = 0;
++	 t->validated = GL_FALSE;
++      }
+       break;
+ 
+    default:
+       return;
+    }
+-
+-   /* Mark this texobj as dirty (one bit per tex unit)
+-    */
+-   t->dirty_state = TEX_ALL;
+ }
+ 
+ 
+-
+-static void r200BindTexture( GLcontext *ctx, GLenum target,
+-			       struct gl_texture_object *texObj )
+-{
+-   if ( R200_DEBUG & (DEBUG_STATE|DEBUG_TEXTURE) ) {
+-      fprintf( stderr, "%s( %p ) unit=%d\n", __FUNCTION__, (void *)texObj,
+-	       ctx->Texture.CurrentUnit );
+-   }
+-
+-   if ( (target == GL_TEXTURE_1D)
+-	|| (target == GL_TEXTURE_2D) 
+-#if ENABLE_HW_3D_TEXTURE
+-	|| (target == GL_TEXTURE_3D)
+-#endif
+-	|| (target == GL_TEXTURE_CUBE_MAP)
+-	|| (target == GL_TEXTURE_RECTANGLE_NV) ) {
+-      assert( texObj->DriverData != NULL );
+-   }
+-}
+-
+-
+-static void r200DeleteTexture( GLcontext *ctx,
+-				 struct gl_texture_object *texObj )
++static void r200DeleteTexture(GLcontext * ctx, struct gl_texture_object *texObj)
+ {
+    r200ContextPtr rmesa = R200_CONTEXT(ctx);
+-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
+-
+-   if ( R200_DEBUG & (DEBUG_STATE|DEBUG_TEXTURE) ) {
+-      fprintf( stderr, "%s( %p (target = %s) )\n", __FUNCTION__, (void *)texObj,
+-	       _mesa_lookup_enum_by_nr( texObj->Target ) );
++   radeonTexObj* t = radeon_tex_obj(texObj); /* presumably the radeonTexObj wrapping texObj */
++
++   if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_TEXTURE)) {
++      fprintf(stderr, "%s( %p (target = %s) )\n", __FUNCTION__,
++	      (void *)texObj,
++	      _mesa_lookup_enum_by_nr(texObj->Target));
++   }
++   
++   if (rmesa) {
++      int i;
++      radeon_firevertices(&rmesa->radeon); /* presumably flushes queued vertices first - TODO confirm */
++      for ( i = 0 ; i < rmesa->radeon.glCtx->Const.MaxTextureUnits ; i++ ) {
++	 if ( t == rmesa->state.texture.unit[i].texobj ) { /* unit i still points at the dying object */
++	    rmesa->state.texture.unit[i].texobj = NULL;
++	    rmesa->hw.tex[i].dirty = GL_FALSE; /* clear the unit's tex/cube dirty flags */
++	    rmesa->hw.cube[i].dirty = GL_FALSE;
++	 }
++      }      
+    }
+-
+-   if ( t != NULL ) {
+-      if ( rmesa ) {
+-         R200_FIREVERTICES( rmesa );
+-      }
+-
+-      driDestroyTextureObject( t );
++   
++   if (t->mt) { /* drop this object's miptree reference, if it holds one */
++      radeon_miptree_unreference(t->mt);
++      t->mt = 0;
+    }
+-   /* Free mipmap images and the texture object itself */
+    _mesa_delete_texture_object(ctx, texObj);
+ }
+ 
+@@ -1150,46 +459,59 @@ static void r200TexGen( GLcontext *ctx,
+  * Called via ctx->Driver.NewTextureObject.
+  * Note: this function will be called during context creation to
+  * allocate the default texture objects.
+- * Note: we could use containment here to 'derive' the driver-specific
+- * texture object from the core mesa gl_texture_object.  Not done at this time.
+  * Fixup MaxAnisotropy according to user preference.
+  */
+-static struct gl_texture_object *
+-r200NewTextureObject( GLcontext *ctx, GLuint name, GLenum target )
++static struct gl_texture_object *r200NewTextureObject(GLcontext * ctx,
++						      GLuint name,
++						      GLenum target)
+ {
+    r200ContextPtr rmesa = R200_CONTEXT(ctx);
+-   struct gl_texture_object *obj;
+-   obj = _mesa_new_texture_object(ctx, name, target);
+-   if (!obj)
+-      return NULL;
+-   obj->MaxAnisotropy = rmesa->initialMaxAnisotropy;
+-   r200AllocTexObj( obj );
+-   return obj;
++   radeonTexObj* t = CALLOC_STRUCT(radeon_tex_obj);
++   if (!t)
++      return NULL; /* allocation failed: report it rather than dereference NULL below */
++   if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_TEXTURE)) {
++     fprintf(stderr, "%s( %p (target = %s) )\n", __FUNCTION__,
++	     (void *)t, _mesa_lookup_enum_by_nr(target));
++   }
 +
-+static struct bo_legacy *bo_allocate(struct bo_manager_legacy *boml,
-+                                     uint32_t size,
-+                                     uint32_t alignment,
-+                                     uint32_t domains,
-+                                     uint32_t flags)
++   _mesa_initialize_texture_object(&t->base, name, target); /* set up the embedded core object */
++   t->base.MaxAnisotropy = rmesa->radeon.initialMaxAnisotropy; /* user-preference default */
++
++   /* Initialize hardware state */
++   r200SetTexWrap( t, t->base.WrapS, t->base.WrapT, t->base.WrapR );
++   r200SetTexMaxAnisotropy( t, t->base.MaxAnisotropy );
++   r200SetTexFilter(t, t->base.MinFilter, t->base.MagFilter);
++   r200SetTexBorderColor(t, t->base._BorderChan);
++
++   return &t->base; /* callers only ever see the embedded base object */
+ }
+ 
+ 
++
+ void r200InitTextureFuncs( struct dd_function_table *functions )
+ {
+    /* Note: we only plug in the functions we implement in the driver
+     * since _mesa_init_driver_functions() was already called.
+     */
+-   functions->ChooseTextureFormat	= r200ChooseTextureFormat;
+-   functions->TexImage1D		= r200TexImage1D;
+-   functions->TexImage2D		= r200TexImage2D;
++   functions->ChooseTextureFormat	= radeonChooseTextureFormat;
++   functions->TexImage1D		= radeonTexImage1D;
++   functions->TexImage2D		= radeonTexImage2D;
+ #if ENABLE_HW_3D_TEXTURE
+-   functions->TexImage3D		= r200TexImage3D;
++   functions->TexImage3D		= radeonTexImage3D;
+ #else
+    functions->TexImage3D		= _mesa_store_teximage3d;
+ #endif
+-   functions->TexSubImage1D		= r200TexSubImage1D;
+-   functions->TexSubImage2D		= r200TexSubImage2D;
++   functions->TexSubImage1D		= radeonTexSubImage1D;
++   functions->TexSubImage2D		= radeonTexSubImage2D;
+ #if ENABLE_HW_3D_TEXTURE
+-   functions->TexSubImage3D		= r200TexSubImage3D;
++   functions->TexSubImage3D		= radeonTexSubImage3D;
+ #else
+    functions->TexSubImage3D		= _mesa_store_texsubimage3d;
+ #endif
++   functions->GetTexImage               = radeonGetTexImage;
++   functions->GetCompressedTexImage     = radeonGetCompressedTexImage;
+    functions->NewTextureObject		= r200NewTextureObject;
+-   functions->BindTexture		= r200BindTexture;
++   /* BindTexture is left at the core default: r200BindTexture is gone. */
+    functions->DeleteTexture		= r200DeleteTexture;
+    functions->IsTextureResident		= driIsTextureResident;
+ 
+@@ -1197,22 +519,16 @@ void r200InitTextureFuncs( struct dd_function_table *functions )
+    functions->TexParameter		= r200TexParameter;
+    functions->TexGen			= r200TexGen;
+ 
+-   functions->CompressedTexImage2D	= r200CompressedTexImage2D;
+-   functions->CompressedTexSubImage2D	= r200CompressedTexSubImage2D;
++   functions->CompressedTexImage2D	= radeonCompressedTexImage2D;
++   functions->CompressedTexSubImage2D	= radeonCompressedTexSubImage2D;
++
++   functions->GenerateMipmap = radeonGenerateMipmap;
++
++   functions->NewTextureImage = radeonNewTextureImage;
++   functions->FreeTexImageData = radeonFreeTexImageData;
++   functions->MapTexture = radeonMapTexture;
++   functions->UnmapTexture = radeonUnmapTexture;
+ 
+    driInitTextureFormats();
+ 
+-#if 000
+-   /* moved or obsolete code */
+-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+-   driInitTextureObjects( ctx, & rmesa->swapped,
+-			  DRI_TEXMGR_DO_TEXTURE_1D
+-			  | DRI_TEXMGR_DO_TEXTURE_2D );
+-
+-   /* Hack: r200NewTextureObject is not yet installed when the
+-    * default textures are created. Therefore set MaxAnisotropy of the
+-    * default 2D texture now. */
+-   ctx->Shared->Default2D->MaxAnisotropy = driQueryOptionf (&rmesa->optionCache,
+-							    "def_max_anisotropy");
+-#endif
+ }
+diff --git a/src/mesa/drivers/dri/r200/r200_tex.h b/src/mesa/drivers/dri/r200/r200_tex.h
+index 10ff8e8..55592ed 100644
+--- a/src/mesa/drivers/dri/r200/r200_tex.h
++++ b/src/mesa/drivers/dri/r200/r200_tex.h
+@@ -41,9 +41,9 @@ extern void r200SetTexOffset(__DRIcontext *pDRICtx, GLint texname,
+ 
+ extern void r200UpdateTextureState( GLcontext *ctx );
+ 
+-extern int r200UploadTexImages( r200ContextPtr rmesa, r200TexObjPtr t, GLuint face );
++extern int r200UploadTexImages( r200ContextPtr rmesa, radeonTexObjPtr t, GLuint face );
+ 
+-extern void r200DestroyTexObj( r200ContextPtr rmesa, r200TexObjPtr t );
++extern void r200DestroyTexObj( r200ContextPtr rmesa, radeonTexObjPtr t );
+ 
+ extern void r200InitTextureFuncs( struct dd_function_table *functions );
+ 
+diff --git a/src/mesa/drivers/dri/r200/r200_texmem.c b/src/mesa/drivers/dri/r200/r200_texmem.c
+deleted file mode 100644
+index 3b81ac0..0000000
+--- a/src/mesa/drivers/dri/r200/r200_texmem.c
++++ /dev/null
+@@ -1,530 +0,0 @@
+-/**************************************************************************
+-
+-Copyright (C) Tungsten Graphics 2002.  All Rights Reserved.  
+-The Weather Channel, Inc. funded Tungsten Graphics to develop the
+-initial release of the Radeon 8500 driver under the XFree86
+-license. This notice must be preserved.
+-
+-Permission is hereby granted, free of charge, to any person obtaining
+-a copy of this software and associated documentation files (the
+-"Software"), to deal in the Software without restriction, including
+-without limitation on the rights to use, copy, modify, merge, publish,
+-distribute, sub license, and/or sell copies of the Software, and to
+-permit persons to whom the Software is furnished to do so, subject to
+-the following conditions:
+-
+-The above copyright notice and this permission notice (including the
+-next paragraph) shall be included in all copies or substantial
+-portions of the Software.
+-
+-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+-NON-INFRINGEMENT. IN NO EVENT SHALL ATI, VA LINUX SYSTEMS AND/OR THEIR
+-SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+-IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+-IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+-SOFTWARE.
+-
+-**************************************************************************/
+-
+-/*
+- * Authors:
+- *   Kevin E. Martin <martin@valinux.com>
+- *   Gareth Hughes <gareth@valinux.com>
+- *
+- */
+- 
+-#include <errno.h>
+-
+-#include "main/glheader.h"
+-#include "main/imports.h"
+-#include "main/context.h"
+-#include "main/colormac.h"
+-#include "main/macros.h"
+-#include "r200_context.h"
+-#include "r200_ioctl.h"
+-#include "r200_tex.h"
+-#include "radeon_reg.h"
+-
+-#include <unistd.h>  /* for usleep() */
+-
+-
+-/**
+- * Destroy any device-dependent state associated with the texture.  This may
+- * include NULLing out hardware state that points to the texture.
+- */
+-void
+-r200DestroyTexObj( r200ContextPtr rmesa, r200TexObjPtr t )
+-{
+-   if ( R200_DEBUG & DEBUG_TEXTURE ) {
+-      fprintf( stderr, "%s( %p, %p )\n", __FUNCTION__, 
+-	       (void *)t, (void *)t->base.tObj );
+-   }
+-
+-   if ( rmesa != NULL ) {
+-      unsigned   i;
+-
+-
+-      for ( i = 0 ; i < rmesa->glCtx->Const.MaxTextureUnits ; i++ ) {
+-	 if ( t == rmesa->state.texture.unit[i].texobj ) {
+-	    rmesa->state.texture.unit[i].texobj = NULL;
+-	    rmesa->hw.tex[i].dirty = GL_FALSE;
+-	    rmesa->hw.cube[i].dirty = GL_FALSE;
+-	 }
+-      }
+-   }
+-}
+-
+-
+-/* ------------------------------------------------------------
+- * Texture image conversions
+- */
+-
+-
+-static void r200UploadGARTClientSubImage( r200ContextPtr rmesa,
+-					  r200TexObjPtr t, 
+-					  struct gl_texture_image *texImage,
+-					  GLint hwlevel,
+-					  GLint x, GLint y, 
+-					  GLint width, GLint height )
+-{
+-   const struct gl_texture_format *texFormat = texImage->TexFormat;
+-   GLuint srcPitch, dstPitch;
+-   int blit_format;
+-   int srcOffset;
+-
+-   /*
+-    * XXX it appears that we always upload the full image, not a subimage.
+-    * I.e. x==0, y==0, width=texWidth, height=texWidth.  If this is ever
+-    * changed, the src pitch will have to change.
+-    */
+-   switch ( texFormat->TexelBytes ) {
+-   case 1:
+-      blit_format = R200_CP_COLOR_FORMAT_CI8;
+-      srcPitch = t->image[0][0].width * texFormat->TexelBytes;
+-      dstPitch = t->image[0][0].width * texFormat->TexelBytes;
+-      break;
+-   case 2:
+-      blit_format = R200_CP_COLOR_FORMAT_RGB565;
+-      srcPitch = t->image[0][0].width * texFormat->TexelBytes;
+-      dstPitch = t->image[0][0].width * texFormat->TexelBytes;
+-      break;
+-   case 4:
+-      blit_format = R200_CP_COLOR_FORMAT_ARGB8888;
+-      srcPitch = t->image[0][0].width * texFormat->TexelBytes;
+-      dstPitch = t->image[0][0].width * texFormat->TexelBytes;
+-      break;
+-   default:
+-      return;
+-   }
+-
+-   t->image[0][hwlevel].data = texImage->Data;
+-   srcOffset = r200GartOffsetFromVirtual( rmesa, texImage->Data );
+-
+-   assert( srcOffset != ~0 );
+-
+-   /* Don't currently need to cope with small pitches?
+-    */
+-   width = texImage->Width;
+-   height = texImage->Height;
+-
+-   r200EmitWait( rmesa, RADEON_WAIT_3D );
+-
+-   r200EmitBlit( rmesa, blit_format, 
+-		 srcPitch,  
+-		 srcOffset,   
+-		 dstPitch,
+-		 t->bufAddr,
+-		 x, 
+-		 y, 
+-		 t->image[0][hwlevel].x + x,
+-		 t->image[0][hwlevel].y + y, 
+-		 width,
+-		 height );
+-
+-   r200EmitWait( rmesa, RADEON_WAIT_2D );
+-}
+-
+-static void r200UploadRectSubImage( r200ContextPtr rmesa,
+-				    r200TexObjPtr t, 
+-				    struct gl_texture_image *texImage,
+-				    GLint x, GLint y, 
+-				    GLint width, GLint height )
+-{
+-   const struct gl_texture_format *texFormat = texImage->TexFormat;
+-   int blit_format, dstPitch, done;
+-
+-   switch ( texFormat->TexelBytes ) {
+-   case 1:
+-      blit_format = R200_CP_COLOR_FORMAT_CI8;
+-      break;
+-   case 2:
+-      blit_format = R200_CP_COLOR_FORMAT_RGB565;
+-      break;
+-   case 4:
+-      blit_format = R200_CP_COLOR_FORMAT_ARGB8888;
+-      break;
+-   default:
+-      return;
+-   }
+-
+-   t->image[0][0].data = texImage->Data;
+-
+-   /* Currently don't need to cope with small pitches.
+-    */
+-   width = texImage->Width;
+-   height = texImage->Height;
+-   dstPitch = t->pp_txpitch + 32;
+-
+-   if (rmesa->prefer_gart_client_texturing && texImage->IsClientData) {
+-      /* In this case, could also use GART texturing.  This is
+-       * currently disabled, but has been tested & works.
+-       */
+-      if ( !t->image_override )
+-         t->pp_txoffset = r200GartOffsetFromVirtual( rmesa, texImage->Data );
+-      t->pp_txpitch = texImage->RowStride * texFormat->TexelBytes - 32;
+-
+-      if (R200_DEBUG & DEBUG_TEXTURE)
+-	 fprintf(stderr, 
+-		 "Using GART texturing for rectangular client texture\n");
+-
+-      /* Release FB memory allocated for this image:
+-       */
+-      /* FIXME This may not be correct as driSwapOutTextureObject sets
+-       * FIXME dirty_images.  It may be fine, though.
+-       */
+-      if ( t->base.memBlock ) {
+-	 driSwapOutTextureObject( (driTextureObject *) t );
+-      }
+-   }
+-   else if (texImage->IsClientData) {
+-      /* Data already in GART memory, with usable pitch.
+-       */
+-      GLuint srcPitch;
+-      srcPitch = texImage->RowStride * texFormat->TexelBytes;
+-      r200EmitBlit( rmesa, 
+-		    blit_format, 
+-		    srcPitch,
+-		    r200GartOffsetFromVirtual( rmesa, texImage->Data ),   
+-		    dstPitch, t->bufAddr,
+-		    0, 0, 
+-		    0, 0, 
+-		    width, height );
+-   }
+-   else {
+-      /* Data not in GART memory, or bad pitch.
+-       */
+-      for (done = 0; done < height ; ) {
+-	 struct r200_dma_region region;
+-	 int lines = MIN2( height - done, RADEON_BUFFER_SIZE / dstPitch );
+-	 int src_pitch;
+-	 char *tex;
+-
+-         src_pitch = texImage->RowStride * texFormat->TexelBytes;
+-
+-	 tex = (char *)texImage->Data + done * src_pitch;
+-
+-	 memset(&region, 0, sizeof(region));
+-	 r200AllocDmaRegion( rmesa, &region, lines * dstPitch, 1024 );
+-
+-	 /* Copy texdata to dma:
+-	  */
+-	 if (0)
+-	    fprintf(stderr, "%s: src_pitch %d dst_pitch %d\n",
+-		    __FUNCTION__, src_pitch, dstPitch);
+-
+-	 if (src_pitch == dstPitch) {
+-	    memcpy( region.address + region.start, tex, lines * src_pitch );
+-	 } 
+-	 else {
+-	    char *buf = region.address + region.start;
+-	    int i;
+-	    for (i = 0 ; i < lines ; i++) {
+-	       memcpy( buf, tex, src_pitch );
+-	       buf += dstPitch;
+-	       tex += src_pitch;
+-	    }
+-	 }
+-
+-	 r200EmitWait( rmesa, RADEON_WAIT_3D );
+-
+-	 /* Blit to framebuffer
+-	  */
+-	 r200EmitBlit( rmesa,
+-		       blit_format,
+-		       dstPitch, GET_START( &region ),
+-		       dstPitch | (t->tile_bits >> 16),
+-		       t->bufAddr,
+-		       0, 0,
+-		       0, done,
+-		       width, lines );
+-	 
+-	 r200EmitWait( rmesa, RADEON_WAIT_2D );
+-
+-	 r200ReleaseDmaRegion( rmesa, &region, __FUNCTION__ );
+-	 done += lines;
+-      }
+-   }
+-}
+-
+-
+-/**
+- * Upload the texture image associated with texture \a t at the specified
+- * level at the address relative to \a start.
+- */
+-static void uploadSubImage( r200ContextPtr rmesa, r200TexObjPtr t, 
+-			    GLint hwlevel,
+-			    GLint x, GLint y, GLint width, GLint height,
+-			    GLuint face )
+-{
+-   struct gl_texture_image *texImage = NULL;
+-   GLuint offset;
+-   GLint imageWidth, imageHeight;
+-   GLint ret;
+-   drm_radeon_texture_t tex;
+-   drm_radeon_tex_image_t tmp;
+-   const int level = hwlevel + t->base.firstLevel;
+-
+-   if ( R200_DEBUG & DEBUG_TEXTURE ) {
+-      fprintf( stderr, "%s( %p, %p ) level/width/height/face = %d/%d/%d/%u\n", 
+-	       __FUNCTION__, (void *)t, (void *)t->base.tObj,
+-	       level, width, height, face );
+-   }
+-
+-   ASSERT(face < 6);
+-
+-   /* Ensure we have a valid texture to upload */
+-   if ( ( hwlevel < 0 ) || ( hwlevel >= RADEON_MAX_TEXTURE_LEVELS ) ) {
+-      _mesa_problem(NULL, "bad texture level in %s", __FUNCTION__);
+-      return;
+-   }
+-
+-   texImage = t->base.tObj->Image[face][level];
+-
+-   if ( !texImage ) {
+-      if ( R200_DEBUG & DEBUG_TEXTURE )
+-	 fprintf( stderr, "%s: texImage %d is NULL!\n", __FUNCTION__, level );
+-      return;
+-   }
+-   if ( !texImage->Data ) {
+-      if ( R200_DEBUG & DEBUG_TEXTURE )
+-	 fprintf( stderr, "%s: image data is NULL!\n", __FUNCTION__ );
+-      return;
+-   }
+-
+-
+-   if (t->base.tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
+-      assert(level == 0);
+-      assert(hwlevel == 0);
+-      if ( R200_DEBUG & DEBUG_TEXTURE )
+-	 fprintf( stderr, "%s: image data is rectangular\n", __FUNCTION__);
+-      r200UploadRectSubImage( rmesa, t, texImage, x, y, width, height );
+-      return;
+-   }
+-   else if (texImage->IsClientData) {
+-      if ( R200_DEBUG & DEBUG_TEXTURE )
+-	 fprintf( stderr, "%s: image data is in GART client storage\n",
+-		  __FUNCTION__);
+-      r200UploadGARTClientSubImage( rmesa, t, texImage, hwlevel,
+-				   x, y, width, height );
+-      return;
+-   }
+-   else if ( R200_DEBUG & DEBUG_TEXTURE )
+-      fprintf( stderr, "%s: image data is in normal memory\n",
+-	       __FUNCTION__);
+-      
+-
+-   imageWidth = texImage->Width;
+-   imageHeight = texImage->Height;
+-
+-   offset = t->bufAddr + t->base.totalSize / 6 * face;
+-
+-   if ( R200_DEBUG & (DEBUG_TEXTURE|DEBUG_IOCTL) ) {
+-      GLint imageX = 0;
+-      GLint imageY = 0;
+-      GLint blitX = t->image[face][hwlevel].x;
+-      GLint blitY = t->image[face][hwlevel].y;
+-      GLint blitWidth = t->image[face][hwlevel].width;
+-      GLint blitHeight = t->image[face][hwlevel].height;
+-      fprintf( stderr, "   upload image: %d,%d at %d,%d\n",
+-	       imageWidth, imageHeight, imageX, imageY );
+-      fprintf( stderr, "   upload  blit: %d,%d at %d,%d\n",
+-	       blitWidth, blitHeight, blitX, blitY );
+-      fprintf( stderr, "       blit ofs: 0x%07x level: %d/%d\n",
+-	       (GLuint)offset, hwlevel, level );
+-   }
+-
+-   t->image[face][hwlevel].data = texImage->Data;
+-
+-   /* Init the DRM_RADEON_TEXTURE command / drm_radeon_texture_t struct.
+-    * NOTE: we're always use a 1KB-wide blit and I8 texture format.
+-    * We used to use 1, 2 and 4-byte texels and used to use the texture
+-    * width to dictate the blit width - but that won't work for compressed
+-    * textures. (Brian)
+-    * NOTE: can't do that with texture tiling. (sroland)
+-    */
+-   tex.offset = offset;
+-   tex.image = &tmp;
+-   /* copy (x,y,width,height,data) */
+-   memcpy( &tmp, &t->image[face][hwlevel], sizeof(tmp) );
+-   
+-   if (texImage->TexFormat->TexelBytes) {
+-      /* use multi-byte upload scheme */
+-      tex.height = imageHeight;
+-      tex.width = imageWidth;
+-      tex.format = t->pp_txformat & R200_TXFORMAT_FORMAT_MASK;
+-      if (tex.format == R200_TXFORMAT_ABGR8888) {
+-	 /* drm will refuse abgr8888 textures. */
+-	 tex.format = R200_TXFORMAT_ARGB8888;
+-      }
+-      tex.pitch = MAX2((texImage->Width * texImage->TexFormat->TexelBytes) / 64, 1);
+-      tex.offset += tmp.x & ~1023;
+-      tmp.x = tmp.x % 1024;
+-      if (t->tile_bits & R200_TXO_MICRO_TILE) {
+-	 /* need something like "tiled coordinates" ? */
+-	 tmp.y = tmp.x / (tex.pitch * 128) * 2;
+-	 tmp.x = tmp.x % (tex.pitch * 128) / 2 / texImage->TexFormat->TexelBytes;
+-	 tex.pitch |= RADEON_DST_TILE_MICRO >> 22;
+-      }
+-      else {
+-	 tmp.x = tmp.x >> (texImage->TexFormat->TexelBytes >> 1);
+-      }
+-      if ((t->tile_bits & R200_TXO_MACRO_TILE) &&
+-	 (texImage->Width * texImage->TexFormat->TexelBytes >= 256) &&
+-	 ((!(t->tile_bits & R200_TXO_MICRO_TILE) && (texImage->Height >= 8)) ||
+-	    (texImage->Height >= 16))) {
+-	 /* weird: R200 disables macro tiling if mip width is smaller than 256 bytes,
+-	    OR if height is smaller than 8 automatically, but if micro tiling is active
+-	    the limit is height 16 instead ? */
+-	 tex.pitch |= RADEON_DST_TILE_MACRO >> 22;
+-      }
+-   }
+-   else {
+-      /* In case of for instance 8x8 texture (2x2 dxt blocks), padding after the first two blocks is
+-         needed (only with dxt1 since 2 dxt3/dxt5 blocks already use 32 Byte). */
+-      /* set tex.height to 1/4 since 1 "macropixel" (dxt-block) has 4 real pixels. Needed
+-         so the kernel module reads the right amount of data. */
+-      tex.format = R200_TXFORMAT_I8; /* any 1-byte texel format */
+-      tex.pitch = (BLIT_WIDTH_BYTES / 64);
+-      tex.height = (imageHeight + 3) / 4;
+-      tex.width = (imageWidth + 3) / 4;
+-      switch (t->pp_txformat & R200_TXFORMAT_FORMAT_MASK) {
+-      case R200_TXFORMAT_DXT1:
+-           tex.width *= 8;
+-           break;
+-      case R200_TXFORMAT_DXT23:
+-      case R200_TXFORMAT_DXT45:
+-           tex.width *= 16;
+-           break;
+-      default:
+-          fprintf(stderr, "unknown compressed tex format in uploadSubImage\n");
+-      }
+-   }
+-
+-   LOCK_HARDWARE( rmesa );
+-   do {
+-      ret = drmCommandWriteRead( rmesa->dri.fd, DRM_RADEON_TEXTURE,
+-                                 &tex, sizeof(drm_radeon_texture_t) );
+-      if (ret) {
+-	 if (R200_DEBUG & DEBUG_IOCTL)
+-	    fprintf(stderr, "DRM_RADEON_TEXTURE:  again!\n");
+-	 usleep(1);
+-      }
+-   } while ( ret == -EAGAIN );
+-
+-   UNLOCK_HARDWARE( rmesa );
+-
+-   if ( ret ) {
+-      fprintf( stderr, "DRM_RADEON_TEXTURE: return = %d\n", ret );
+-      fprintf( stderr, "   offset=0x%08x\n",
+-	       offset );
+-      fprintf( stderr, "   image width=%d height=%d\n",
+-	       imageWidth, imageHeight );
+-      fprintf( stderr, "    blit width=%d height=%d data=%p\n",
+-	       t->image[face][hwlevel].width, t->image[face][hwlevel].height,
+-	       t->image[face][hwlevel].data );
+-      exit( 1 );
+-   }
+-}
+-
+-
+-/**
+- * Upload the texture images associated with texture \a t.  This might
+- * require the allocation of texture memory.
+- * 
+- * \param rmesa Context pointer
+- * \param t Texture to be uploaded
+- * \param face Cube map face to be uploaded.  Zero for non-cube maps.
+- */
+-
+-int r200UploadTexImages( r200ContextPtr rmesa, r200TexObjPtr t, GLuint face )
+-{
+-   const int numLevels = t->base.lastLevel - t->base.firstLevel + 1;
+-
+-   if ( R200_DEBUG & (DEBUG_TEXTURE|DEBUG_IOCTL) ) {
+-      fprintf( stderr, "%s( %p, %p ) sz=%d lvls=%d-%d\n", __FUNCTION__,
+-	       (void *)rmesa->glCtx, (void *)t->base.tObj, t->base.totalSize,
+-	       t->base.firstLevel, t->base.lastLevel );
+-   }
+-
+-   if ( !t || t->base.totalSize == 0 || t->image_override )
+-      return 0;
+-
+-   if (R200_DEBUG & DEBUG_SYNC) {
+-      fprintf(stderr, "%s: Syncing\n", __FUNCTION__ );
+-      r200Finish( rmesa->glCtx );
+-   }
+-
+-   LOCK_HARDWARE( rmesa );
+-
+-   if ( t->base.memBlock == NULL ) {
+-      int heap;
+-
+-      heap = driAllocateTexture( rmesa->texture_heaps, rmesa->nr_heaps,
+-				 (driTextureObject *) t );
+-      if ( heap == -1 ) {
+-	 UNLOCK_HARDWARE( rmesa );
+-	 return -1;
+-      }
+-
+-      /* Set the base offset of the texture image */
+-      t->bufAddr = rmesa->r200Screen->texOffset[heap] 
+-	   + t->base.memBlock->ofs;
+-      t->pp_txoffset = t->bufAddr;
+-       
+-      if (!(t->base.tObj->Image[0][0]->IsClientData)) {
+-	 /* hope it's safe to add that here... */
+-	 t->pp_txoffset |= t->tile_bits;
+-      }
+-
+-      /* Mark this texobj as dirty on all units:
+-       */
+-      t->dirty_state = TEX_ALL;
+-   }
+-
+-   /* Let the world know we've used this memory recently.
+-    */
+-   driUpdateTextureLRU( (driTextureObject *) t );
+-   UNLOCK_HARDWARE( rmesa );
+-
+-   /* Upload any images that are new */
+-   if (t->base.dirty_images[face]) {
+-      int i;
+-      for ( i = 0 ; i < numLevels ; i++ ) {
+-         if ( (t->base.dirty_images[face] & (1 << (i+t->base.firstLevel))) != 0 ) {
+-            uploadSubImage( rmesa, t, i, 0, 0, t->image[face][i].width,
+-			    t->image[face][i].height, face );
+-         }
+-      }
+-      t->base.dirty_images[face] = 0;
+-   }
+-
+-
+-   if (R200_DEBUG & DEBUG_SYNC) {
+-      fprintf(stderr, "%s: Syncing\n", __FUNCTION__ );
+-      r200Finish( rmesa->glCtx );
+-   }
+-
+-   return 0;
+-}
+diff --git a/src/mesa/drivers/dri/r200/r200_texstate.c b/src/mesa/drivers/dri/r200/r200_texstate.c
+index 3f9a2f4..6432068 100644
+--- a/src/mesa/drivers/dri/r200/r200_texstate.c
++++ b/src/mesa/drivers/dri/r200/r200_texstate.c
+@@ -40,6 +40,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "main/texobj.h"
+ #include "main/enums.h"
+ 
++#include "radeon_common.h"
++#include "radeon_mipmap_tree.h"
+ #include "r200_context.h"
+ #include "r200_state.h"
+ #include "r200_ioctl.h"
+@@ -139,257 +141,6 @@ static const struct tx_table tx_table_le[] =
+ #undef _ALPHA
+ #undef _INVALID
+ 
+-/**
+- * This function computes the number of bytes of storage needed for
+- * the given texture object (all mipmap levels, all cube faces).
+- * The \c image[face][level].x/y/width/height parameters for upload/blitting
+- * are computed here.  \c pp_txfilter, \c pp_txformat, etc. will be set here
+- * too.
+- * 
+- * \param rmesa Context pointer
+- * \param tObj GL texture object whose images are to be posted to
+- *                 hardware state.
+- */
+-static void r200SetTexImages( r200ContextPtr rmesa,
+-			      struct gl_texture_object *tObj )
+-{
+-   r200TexObjPtr t = (r200TexObjPtr)tObj->DriverData;
+-   const struct gl_texture_image *baseImage = tObj->Image[0][tObj->BaseLevel];
+-   GLint curOffset, blitWidth;
+-   GLint i, texelBytes;
+-   GLint numLevels;
+-   GLint log2Width, log2Height, log2Depth;
+-
+-   /* Set the hardware texture format
+-    */
+-   if ( !t->image_override ) {
+-      if ( VALID_FORMAT( baseImage->TexFormat->MesaFormat ) ) {
+-	 const struct tx_table *table = _mesa_little_endian() ? tx_table_le :
+-								tx_table_be;
+-
+-         t->pp_txformat &= ~(R200_TXFORMAT_FORMAT_MASK |
+-                             R200_TXFORMAT_ALPHA_IN_MAP);
+-         t->pp_txfilter &= ~R200_YUV_TO_RGB;
+-
+-	 t->pp_txformat |= table[ baseImage->TexFormat->MesaFormat ].format;
+-	 t->pp_txfilter |= table[ baseImage->TexFormat->MesaFormat ].filter;
+-      }
+-      else {
+-         _mesa_problem(NULL, "unexpected texture format in %s", __FUNCTION__);
+-         return;
+-      }
+-   }
+-
+-   texelBytes = baseImage->TexFormat->TexelBytes;
+-
+-   /* Compute which mipmap levels we really want to send to the hardware.
+-    */
+-
+-   driCalculateTextureFirstLastLevel( (driTextureObject *) t );
+-   log2Width  = tObj->Image[0][t->base.firstLevel]->WidthLog2;
+-   log2Height = tObj->Image[0][t->base.firstLevel]->HeightLog2;
+-   log2Depth  = tObj->Image[0][t->base.firstLevel]->DepthLog2;
+-
+-   numLevels = t->base.lastLevel - t->base.firstLevel + 1;
+-
+-   assert(numLevels <= RADEON_MAX_TEXTURE_LEVELS);
+-
+-   /* Calculate mipmap offsets and dimensions for blitting (uploading)
+-    * The idea is that we lay out the mipmap levels within a block of
+-    * memory organized as a rectangle of width BLIT_WIDTH_BYTES.
+-    */
+-   curOffset = 0;
+-   blitWidth = BLIT_WIDTH_BYTES;
+-   t->tile_bits = 0;
+-
+-   /* figure out if this texture is suitable for tiling. */
+-   if (texelBytes) {
+-      if (rmesa->texmicrotile  && (tObj->Target != GL_TEXTURE_RECTANGLE_NV) &&
+-      /* texrect might be able to use micro tiling too in theory? */
+-	 (baseImage->Height > 1)) {
+-	 /* allow 32 (bytes) x 1 mip (which will use two times the space
+-	 the non-tiled version would use) max if base texture is large enough */
+-	 if ((numLevels == 1) ||
+-	   (((baseImage->Width * texelBytes / baseImage->Height) <= 32) &&
+-	       (baseImage->Width * texelBytes > 64)) ||
+-	    ((baseImage->Width * texelBytes / baseImage->Height) <= 16)) {
+-	    t->tile_bits |= R200_TXO_MICRO_TILE;
+-	 }
+-      }
+-      if (tObj->Target != GL_TEXTURE_RECTANGLE_NV) {
+-	 /* we can set macro tiling even for small textures, they will be untiled anyway */
+-	 t->tile_bits |= R200_TXO_MACRO_TILE;
+-      }
+-   }
+-
+-   for (i = 0; i < numLevels; i++) {
+-      const struct gl_texture_image *texImage;
+-      GLuint size;
+-
+-      texImage = tObj->Image[0][i + t->base.firstLevel];
+-      if ( !texImage )
+-	 break;
+-
+-      /* find image size in bytes */
+-      if (texImage->IsCompressed) {
+-      /* need to calculate the size AFTER padding even though the texture is
+-         submitted without padding.
+-         Only handle pot textures currently - don't know if npot is even possible,
+-         size calculation would certainly need (trivial) adjustments.
+-         Align (and later pad) to 32byte, not sure what that 64byte blit width is
+-         good for? */
+-         if ((t->pp_txformat & R200_TXFORMAT_FORMAT_MASK) == R200_TXFORMAT_DXT1) {
+-            /* RGB_DXT1/RGBA_DXT1, 8 bytes per block */
+-            if ((texImage->Width + 3) < 8) /* width one block */
+-               size = texImage->CompressedSize * 4;
+-            else if ((texImage->Width + 3) < 16)
+-               size = texImage->CompressedSize * 2;
+-            else size = texImage->CompressedSize;
+-         }
+-         else /* DXT3/5, 16 bytes per block */
+-            if ((texImage->Width + 3) < 8)
+-               size = texImage->CompressedSize * 2;
+-            else size = texImage->CompressedSize;
+-      }
+-      else if (tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
+-	 size = ((texImage->Width * texelBytes + 63) & ~63) * texImage->Height;
+-      }
+-      else if (t->tile_bits & R200_TXO_MICRO_TILE) {
+-	 /* tile pattern is 16 bytes x2. mipmaps stay 32 byte aligned,
+-	    though the actual offset may be different (if texture is less than
+-	    32 bytes width) to the untiled case */
+-	 int w = (texImage->Width * texelBytes * 2 + 31) & ~31;
+-	 size = (w * ((texImage->Height + 1) / 2)) * texImage->Depth;
+-	 blitWidth = MAX2(texImage->Width, 64 / texelBytes);
+-      }
+-      else {
+-	 int w = (texImage->Width * texelBytes + 31) & ~31;
+-	 size = w * texImage->Height * texImage->Depth;
+-	 blitWidth = MAX2(texImage->Width, 64 / texelBytes);
+-      }
+-      assert(size > 0);
+-
+-      /* Align to 32-byte offset.  It is faster to do this unconditionally
+-       * (no branch penalty).
+-       */
+-
+-      curOffset = (curOffset + 0x1f) & ~0x1f;
+-
+-      if (texelBytes) {
+-	 t->image[0][i].x = curOffset; /* fix x and y coords up later together with offset */
+-	 t->image[0][i].y = 0;
+-	 t->image[0][i].width = MIN2(size / texelBytes, blitWidth);
+-	 t->image[0][i].height = (size / texelBytes) / t->image[0][i].width;
+-      }
+-      else {
+-         t->image[0][i].x = curOffset % BLIT_WIDTH_BYTES;
+-         t->image[0][i].y = curOffset / BLIT_WIDTH_BYTES;
+-         t->image[0][i].width  = MIN2(size, BLIT_WIDTH_BYTES);
+-         t->image[0][i].height = size / t->image[0][i].width;     
+-      }
+-
+-#if 0
+-      /* for debugging only and only  applicable to non-rectangle targets */
+-      assert(size % t->image[0][i].width == 0);
+-      assert(t->image[0][i].x == 0
+-             || (size < BLIT_WIDTH_BYTES && t->image[0][i].height == 1));
+-#endif
+-
+-      if (0)
+-         fprintf(stderr,
+-                 "level %d: %dx%d x=%d y=%d w=%d h=%d size=%d at %d\n",
+-                 i, texImage->Width, texImage->Height,
+-                 t->image[0][i].x, t->image[0][i].y,
+-                 t->image[0][i].width, t->image[0][i].height, size, curOffset);
+-
+-      curOffset += size;
+-
+-   }
+-
+-   /* Align the total size of texture memory block.
+-    */
+-   t->base.totalSize = (curOffset + RADEON_OFFSET_MASK) & ~RADEON_OFFSET_MASK;
+-
+-   /* Setup remaining cube face blits, if needed */
+-   if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
+-      const GLuint faceSize = t->base.totalSize;
+-      GLuint face;
+-      /* reuse face 0 x/y/width/height - just update the offset when uploading */
+-      for (face = 1; face < 6; face++) {
+-         for (i = 0; i < numLevels; i++) {
+-            t->image[face][i].x =  t->image[0][i].x;
+-            t->image[face][i].y =  t->image[0][i].y;
+-            t->image[face][i].width  = t->image[0][i].width;
+-            t->image[face][i].height = t->image[0][i].height;
+-         }
+-      }
+-      t->base.totalSize = 6 * faceSize; /* total texmem needed */
+-   }
+-
+-
+-   /* Hardware state:
+-    */
+-   t->pp_txfilter &= ~R200_MAX_MIP_LEVEL_MASK;
+-   t->pp_txfilter |= (numLevels - 1) << R200_MAX_MIP_LEVEL_SHIFT;
+-
+-   t->pp_txformat &= ~(R200_TXFORMAT_WIDTH_MASK |
+-		       R200_TXFORMAT_HEIGHT_MASK |
+-                       R200_TXFORMAT_CUBIC_MAP_ENABLE |
+-                       R200_TXFORMAT_F5_WIDTH_MASK |
+-                       R200_TXFORMAT_F5_HEIGHT_MASK);
+-   t->pp_txformat |= ((log2Width << R200_TXFORMAT_WIDTH_SHIFT) |
+-		      (log2Height << R200_TXFORMAT_HEIGHT_SHIFT));
+-
+-   t->pp_txformat_x &= ~(R200_DEPTH_LOG2_MASK | R200_TEXCOORD_MASK);
+-   if (tObj->Target == GL_TEXTURE_3D) {
+-      t->pp_txformat_x |= (log2Depth << R200_DEPTH_LOG2_SHIFT);
+-      t->pp_txformat_x |= R200_TEXCOORD_VOLUME;
+-   }
+-   else if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
+-      ASSERT(log2Width == log2Height);
+-      t->pp_txformat |= ((log2Width << R200_TXFORMAT_F5_WIDTH_SHIFT) |
+-                         (log2Height << R200_TXFORMAT_F5_HEIGHT_SHIFT) |
+-/* don't think we need this bit, if it exists at all - fglrx does not set it */
+-                         (R200_TXFORMAT_CUBIC_MAP_ENABLE));
+-      t->pp_txformat_x |= R200_TEXCOORD_CUBIC_ENV;
+-      t->pp_cubic_faces = ((log2Width << R200_FACE_WIDTH_1_SHIFT) |
+-                           (log2Height << R200_FACE_HEIGHT_1_SHIFT) |
+-                           (log2Width << R200_FACE_WIDTH_2_SHIFT) |
+-                           (log2Height << R200_FACE_HEIGHT_2_SHIFT) |
+-                           (log2Width << R200_FACE_WIDTH_3_SHIFT) |
+-                           (log2Height << R200_FACE_HEIGHT_3_SHIFT) |
+-                           (log2Width << R200_FACE_WIDTH_4_SHIFT) |
+-                           (log2Height << R200_FACE_HEIGHT_4_SHIFT));
+-   }
+-   else {
+-      /* If we don't in fact send enough texture coordinates, q will be 1,
+-       * making TEXCOORD_PROJ act like TEXCOORD_NONPROJ (Right?)
+-       */
+-      t->pp_txformat_x |= R200_TEXCOORD_PROJ;
+-   }
+-
+-   t->pp_txsize = (((tObj->Image[0][t->base.firstLevel]->Width - 1) << 0) |
+-                   ((tObj->Image[0][t->base.firstLevel]->Height - 1) << 16));
+-
+-   /* Only need to round to nearest 32 for textures, but the blitter
+-    * requires 64-byte aligned pitches, and we may/may not need the
+-    * blitter.   NPOT only!
+-    */
+-   if ( !t->image_override ) {
+-      if (baseImage->IsCompressed)
+-         t->pp_txpitch = (tObj->Image[0][t->base.firstLevel]->Width + 63) & ~(63);
+-      else
+-         t->pp_txpitch = ((tObj->Image[0][t->base.firstLevel]->Width * texelBytes) + 63) & ~(63);
+-      t->pp_txpitch -= 32;
+-   }
+-
+-   t->dirty_state = TEX_ALL;
+-
+-   /* FYI: r200UploadTexImages( rmesa, t ) used to be called here */
+-}
+-
+-
+-
+ /* ================================================================
+  * Texture combine functions
+  */
+@@ -981,20 +732,19 @@ void r200SetTexOffset(__DRIcontext * pDRICtx, GLint texname,
+ {
+ 	r200ContextPtr rmesa = pDRICtx->driverPrivate;
+ 	struct gl_texture_object *tObj =
+-	    _mesa_lookup_texture(rmesa->glCtx, texname);
+-	r200TexObjPtr t;
++	    _mesa_lookup_texture(rmesa->radeon.glCtx, texname);
++	radeonTexObjPtr t = radeon_tex_obj(tObj);
+ 
+ 	if (!tObj)
+ 		return;
+ 
+-	t = (r200TexObjPtr) tObj->DriverData;
+-
+ 	t->image_override = GL_TRUE;
+ 
+ 	if (!offset)
+ 		return;
+ 
+-	t->pp_txoffset = offset;
++	t->bo = NULL;
++	t->override_offset = offset;
+ 	t->pp_txpitch = pitch - 32;
+ 
+ 	switch (depth) {
+@@ -1207,12 +957,41 @@ static GLboolean r200UpdateAllTexEnv( GLcontext *ctx )
+                                 R200_VOLUME_FILTER_MASK)
+ 
+ 
++static void disable_tex_obj_state( r200ContextPtr rmesa, 
++				   int unit )
 +{
-+    struct bo_legacy *bo_legacy;
-+    static int pgsize;
++   
++   R200_STATECHANGE( rmesa, vtx );
++   rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_1] &= ~(7 << (unit * 3));
 +
-+    if (pgsize == 0)
-+        pgsize = getpagesize() - 1;
++   if (rmesa->radeon.TclFallback & (R200_TCL_FALLBACK_TEXGEN_0<<unit)) {
++      TCL_FALLBACK( rmesa->radeon.glCtx, (R200_TCL_FALLBACK_TEXGEN_0<<unit), GL_FALSE);
++   }
 +
-+    size = (size + pgsize) & ~pgsize;
++   /* Actually want to keep all units less than max active texture
++    * enabled, right?  Fix this for >2 texunits.
++    */
 +
-+    bo_legacy = (struct bo_legacy*)calloc(1, sizeof(struct bo_legacy));
-+    if (bo_legacy == NULL) {
-+        return NULL;
-+    }
-+    bo_legacy->base.bom = (struct radeon_bo_manager*)boml;
-+    bo_legacy->base.handle = 0;
-+    bo_legacy->base.size = size;
-+    bo_legacy->base.alignment = alignment;
-+    bo_legacy->base.domains = domains;
-+    bo_legacy->base.flags = flags;
-+    bo_legacy->base.ptr = NULL;
-+    bo_legacy->map_count = 0;
-+    bo_legacy->next = NULL;
-+    bo_legacy->prev = NULL;
-+    bo_legacy->pnext = NULL;
-+    bo_legacy->pprev = NULL;
-+    bo_legacy->next = boml->bos.next;
-+    bo_legacy->prev = &boml->bos;
-+    boml->bos.next = bo_legacy;
-+    if (bo_legacy->next) {
-+        bo_legacy->next->prev = bo_legacy;
-+    }
-+    return bo_legacy;
-+}
++   {
++      GLuint tmp = rmesa->TexGenEnabled;
 +
-+static int bo_dma_alloc(struct radeon_bo *bo)
-+{
-+    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bo->bom;
-+    struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
-+    drm_radeon_mem_alloc_t alloc;
-+    unsigned size;
-+    int base_offset;
-+    int r;
++      rmesa->TexGenEnabled &= ~(R200_TEXGEN_TEXMAT_0_ENABLE<<unit);
++      rmesa->TexGenEnabled &= ~(R200_TEXMAT_0_ENABLE<<unit);
++      rmesa->TexGenNeedNormals[unit] = GL_FALSE;
++      rmesa->TexGenCompSel &= ~(R200_OUTPUT_TEX_0 << unit);
 +
-+    /* align size on 4Kb */
-+    size = (((4 * 1024) - 1) + bo->size) & ~((4 * 1024) - 1);
-+    alloc.region = RADEON_MEM_REGION_GART;
-+    alloc.alignment = bo_legacy->base.alignment;
-+    alloc.size = size;
-+    alloc.region_offset = &base_offset;
-+    r = drmCommandWriteRead(bo->bom->fd,
-+                            DRM_RADEON_ALLOC,
-+                            &alloc,
-+                            sizeof(alloc));
-+    if (r) {
-+        /* ptr is set to NULL if dma allocation failed */
-+        bo_legacy->ptr = NULL;
-+        return r;
++      if (tmp != rmesa->TexGenEnabled) {
++	 rmesa->recheck_texgen[unit] = GL_TRUE;
++	 rmesa->radeon.NewGLState |= _NEW_TEXTURE_MATRIX;
++      }
++   }
++}
+ static void import_tex_obj_state( r200ContextPtr rmesa,
+ 				  int unit,
+-				  r200TexObjPtr texobj )
++				  radeonTexObjPtr texobj )
+ {
+ /* do not use RADEON_DB_STATE to avoid stale texture caches */
+-   int *cmd = &rmesa->hw.tex[unit].cmd[TEX_CMD_0];
++   GLuint *cmd = &rmesa->hw.tex[unit].cmd[TEX_CMD_0];
+ 
+    R200_STATECHANGE( rmesa, tex[unit] );
+ 
+@@ -1225,36 +1004,21 @@ static void import_tex_obj_state( r200ContextPtr rmesa,
+    cmd[TEX_PP_TXSIZE] = texobj->pp_txsize; /* NPOT only! */
+    cmd[TEX_PP_TXPITCH] = texobj->pp_txpitch; /* NPOT only! */
+    cmd[TEX_PP_BORDER_COLOR] = texobj->pp_border_color;
+-   if (rmesa->r200Screen->drmSupportsFragShader) {
+-      cmd[TEX_PP_TXOFFSET_NEWDRM] = texobj->pp_txoffset;
+-   }
+-   else {
+-      cmd[TEX_PP_TXOFFSET_OLDDRM] = texobj->pp_txoffset;
+-   }
+ 
+-   if (texobj->base.tObj->Target == GL_TEXTURE_CUBE_MAP) {
+-      int *cube_cmd = &rmesa->hw.cube[unit].cmd[CUBE_CMD_0];
+-      GLuint bytesPerFace = texobj->base.totalSize / 6;
+-      ASSERT(texobj->base.totalSize % 6 == 0);
++   if (texobj->base.Target == GL_TEXTURE_CUBE_MAP) {
++      GLuint *cube_cmd = &rmesa->hw.cube[unit].cmd[CUBE_CMD_0];
+ 
+       R200_STATECHANGE( rmesa, cube[unit] );
+       cube_cmd[CUBE_PP_CUBIC_FACES] = texobj->pp_cubic_faces;
+-      if (rmesa->r200Screen->drmSupportsFragShader) {
++      if (rmesa->radeon.radeonScreen->drmSupportsFragShader) {
+ 	 /* that value is submitted twice. could change cube atom
+ 	    to not include that command when new drm is used */
+ 	 cmd[TEX_PP_CUBIC_FACES] = texobj->pp_cubic_faces;
+       }
+-      cube_cmd[CUBE_PP_CUBIC_OFFSET_F1] = texobj->pp_txoffset + 1 * bytesPerFace;
+-      cube_cmd[CUBE_PP_CUBIC_OFFSET_F2] = texobj->pp_txoffset + 2 * bytesPerFace;
+-      cube_cmd[CUBE_PP_CUBIC_OFFSET_F3] = texobj->pp_txoffset + 3 * bytesPerFace;
+-      cube_cmd[CUBE_PP_CUBIC_OFFSET_F4] = texobj->pp_txoffset + 4 * bytesPerFace;
+-      cube_cmd[CUBE_PP_CUBIC_OFFSET_F5] = texobj->pp_txoffset + 5 * bytesPerFace;
+    }
+ 
+-   texobj->dirty_state &= ~(1<<unit);
+ }
+ 
+-
+ static void set_texgen_matrix( r200ContextPtr rmesa, 
+ 			       GLuint unit,
+ 			       const GLfloat *s_plane,
+@@ -1377,7 +1141,6 @@ static GLboolean r200_validate_texgen( GLcontext *ctx, GLuint unit )
+    } else {
+       tgcm |= R200_TEXGEN_COMP_T << (unit * 4);
+    }
+-
+    if (texUnit->TexGenEnabled & R_BIT) {
+       if (texUnit->GenModeR != mode)
+ 	 mixed_fallback = GL_TRUE;
+@@ -1513,52 +1276,6 @@ static GLboolean r200_validate_texgen( GLcontext *ctx, GLuint unit )
+    return GL_TRUE;
+ }
+ 
+-
+-static void disable_tex( GLcontext *ctx, int unit )
+-{
+-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+-
+-   if (rmesa->hw.ctx.cmd[CTX_PP_CNTL] & (R200_TEX_0_ENABLE<<unit)) {
+-      /* Texture unit disabled */
+-      if ( rmesa->state.texture.unit[unit].texobj != NULL ) {
+-	 /* The old texture is no longer bound to this texture unit.
+-	  * Mark it as such.
+-	  */
+-
+-	 rmesa->state.texture.unit[unit].texobj->base.bound &= ~(1UL << unit);
+-	 rmesa->state.texture.unit[unit].texobj = NULL;
+-      }
+-
+-      R200_STATECHANGE( rmesa, ctx );
+-      rmesa->hw.ctx.cmd[CTX_PP_CNTL] &= ~(R200_TEX_0_ENABLE << unit);
+-	 
+-      R200_STATECHANGE( rmesa, vtx );
+-      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_1] &= ~(7 << (unit * 3));
+-	 
+-      if (rmesa->TclFallback & (R200_TCL_FALLBACK_TEXGEN_0<<unit)) {
+-	 TCL_FALLBACK( ctx, (R200_TCL_FALLBACK_TEXGEN_0<<unit), GL_FALSE);
+-      }
+-
+-      /* Actually want to keep all units less than max active texture
+-       * enabled, right?  Fix this for >2 texunits.
+-       */
+-
+-      {
+-	 GLuint tmp = rmesa->TexGenEnabled;
+-
+-	 rmesa->TexGenEnabled &= ~(R200_TEXGEN_TEXMAT_0_ENABLE<<unit);
+-	 rmesa->TexGenEnabled &= ~(R200_TEXMAT_0_ENABLE<<unit);
+-	 rmesa->TexGenNeedNormals[unit] = GL_FALSE;
+-	 rmesa->TexGenCompSel &= ~(R200_OUTPUT_TEX_0 << unit);
+-
+-	 if (tmp != rmesa->TexGenEnabled) {
+-	    rmesa->recheck_texgen[unit] = GL_TRUE;
+-	    rmesa->NewGLState |= _NEW_TEXTURE_MATRIX;
+-	 }
+-      }
+-   }
+-}
+-
+ void set_re_cntl_d3d( GLcontext *ctx, int unit, GLboolean use_d3d )
+ {
+    r200ContextPtr rmesa = R200_CONTEXT(ctx);
+@@ -1575,237 +1292,165 @@ void set_re_cntl_d3d( GLcontext *ctx, int unit, GLboolean use_d3d )
+    }
+ }
+ 
+-static GLboolean enable_tex_2d( GLcontext *ctx, int unit )
+-{
+-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+-   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+-   struct gl_texture_object *tObj = texUnit->_Current;
+-   r200TexObjPtr t = (r200TexObjPtr) tObj->DriverData;
+-
+-   /* Need to load the 2d images associated with this unit.
+-    */
+-   if (t->pp_txformat & R200_TXFORMAT_NON_POWER2) {
+-      t->pp_txformat &= ~R200_TXFORMAT_NON_POWER2;
+-      t->base.dirty_images[0] = ~0;
+-   }
+-
+-   ASSERT(tObj->Target == GL_TEXTURE_2D || tObj->Target == GL_TEXTURE_1D);
+-
+-   if ( t->base.dirty_images[0] ) {
+-      R200_FIREVERTICES( rmesa );
+-      r200SetTexImages( rmesa, tObj );
+-      r200UploadTexImages( rmesa, (r200TexObjPtr) tObj->DriverData, 0 );
+-      if ( !t->base.memBlock && !t->image_override ) 
+-	 return GL_FALSE;
+-   }
+-
+-   set_re_cntl_d3d( ctx, unit, GL_FALSE );
+-
+-   return GL_TRUE;
+-}
+-
+-#if ENABLE_HW_3D_TEXTURE
+-static GLboolean enable_tex_3d( GLcontext *ctx, int unit )
++/**
++ * Compute the cached hardware register values for the given texture object.
++ *
++ * \param rmesa Context pointer
++ * \param t the radeon texture object
++ */
++static void setup_hardware_state(r200ContextPtr rmesa, radeonTexObj *t)
+ {
+-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+-   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+-   struct gl_texture_object *tObj = texUnit->_Current;
+-   r200TexObjPtr t = (r200TexObjPtr) tObj->DriverData;
+-
+-   /* Need to load the 3d images associated with this unit.
+-    */
+-   if (t->pp_txformat & R200_TXFORMAT_NON_POWER2) {
+-      t->pp_txformat &= ~R200_TXFORMAT_NON_POWER2;
+-      t->base.dirty_images[0] = ~0;
++   const struct gl_texture_image *firstImage =
++      t->base.Image[0][t->mt->firstLevel];
++   GLint log2Width, log2Height, log2Depth, texelBytes;
++   
++   log2Width  = firstImage->WidthLog2;
++   log2Height = firstImage->HeightLog2;
++   log2Depth  = firstImage->DepthLog2;
++   texelBytes = firstImage->TexFormat->TexelBytes;
++
++
++   if (!t->image_override) {
++      if (VALID_FORMAT(firstImage->TexFormat->MesaFormat)) {
++	 const struct tx_table *table = _mesa_little_endian() ? tx_table_le :
++	    tx_table_be;
++	 
++	 t->pp_txformat &= ~(R200_TXFORMAT_FORMAT_MASK |
++			     R200_TXFORMAT_ALPHA_IN_MAP);
++	 t->pp_txfilter &= ~R200_YUV_TO_RGB;
++	 
++	 t->pp_txformat |= table[ firstImage->TexFormat->MesaFormat ].format;
++	 t->pp_txfilter |= table[ firstImage->TexFormat->MesaFormat ].filter;
++      } else {
++	 _mesa_problem(NULL, "unexpected texture format in %s",
++		       __FUNCTION__);
++	 return;
++      }
+    }
++   
++   t->pp_txfilter &= ~R200_MAX_MIP_LEVEL_MASK;
++   t->pp_txfilter |= (t->mt->lastLevel - t->mt->firstLevel) << R200_MAX_MIP_LEVEL_SHIFT;
++	
++   t->pp_txformat &= ~(R200_TXFORMAT_WIDTH_MASK |
++		       R200_TXFORMAT_HEIGHT_MASK |
++		       R200_TXFORMAT_CUBIC_MAP_ENABLE |
++		       R200_TXFORMAT_F5_WIDTH_MASK |
++		       R200_TXFORMAT_F5_HEIGHT_MASK);
++   t->pp_txformat |= ((log2Width << R200_TXFORMAT_WIDTH_SHIFT) |
++		      (log2Height << R200_TXFORMAT_HEIGHT_SHIFT));
++   
++   t->tile_bits = 0;
++   
++   t->pp_txformat_x &= ~(R200_DEPTH_LOG2_MASK | R200_TEXCOORD_MASK);
++   if (t->base.Target == GL_TEXTURE_3D) {
++      t->pp_txformat_x |= (log2Depth << R200_DEPTH_LOG2_SHIFT);
++      t->pp_txformat_x |= R200_TEXCOORD_VOLUME;
+ 
+-   ASSERT(tObj->Target == GL_TEXTURE_3D);
+-
+-   /* R100 & R200 do not support mipmaps for 3D textures.
+-    */
+-   if ( (tObj->MinFilter != GL_NEAREST) && (tObj->MinFilter != GL_LINEAR) ) {
+-      return GL_FALSE;
+    }
+-
+-   if ( t->base.dirty_images[0] ) {
+-      R200_FIREVERTICES( rmesa );
+-      r200SetTexImages( rmesa, tObj );
+-      r200UploadTexImages( rmesa, (r200TexObjPtr) tObj->DriverData, 0 );
+-      if ( !t->base.memBlock ) 
+-	 return GL_FALSE;
++   else if (t->base.Target == GL_TEXTURE_CUBE_MAP) {
++      ASSERT(log2Width == log2Height);
++      t->pp_txformat |= ((log2Width << R200_TXFORMAT_F5_WIDTH_SHIFT) |
++			 (log2Height << R200_TXFORMAT_F5_HEIGHT_SHIFT) |
++			 /* don't think we need this bit, if it exists at all - fglrx does not set it */
++			 (R200_TXFORMAT_CUBIC_MAP_ENABLE));
++      t->pp_txformat_x |= R200_TEXCOORD_CUBIC_ENV;
++      t->pp_cubic_faces = ((log2Width << R200_FACE_WIDTH_1_SHIFT) |
++                           (log2Height << R200_FACE_HEIGHT_1_SHIFT) |
++                           (log2Width << R200_FACE_WIDTH_2_SHIFT) |
++                           (log2Height << R200_FACE_HEIGHT_2_SHIFT) |
++                           (log2Width << R200_FACE_WIDTH_3_SHIFT) |
++                           (log2Height << R200_FACE_HEIGHT_3_SHIFT) |
++                           (log2Width << R200_FACE_WIDTH_4_SHIFT) |
++                           (log2Height << R200_FACE_HEIGHT_4_SHIFT));
+    }
+-
+-   set_re_cntl_d3d( ctx, unit, GL_TRUE );
+-
+-   return GL_TRUE;
+-}
+-#endif
+-
+-static GLboolean enable_tex_cube( GLcontext *ctx, int unit )
+-{
+-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+-   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+-   struct gl_texture_object *tObj = texUnit->_Current;
+-   r200TexObjPtr t = (r200TexObjPtr) tObj->DriverData;
+-   GLuint face;
+-
+-   /* Need to load the 2d images associated with this unit.
+-    */
+-   if (t->pp_txformat & R200_TXFORMAT_NON_POWER2) {
+-      t->pp_txformat &= ~R200_TXFORMAT_NON_POWER2;
+-      for (face = 0; face < 6; face++)
+-         t->base.dirty_images[face] = ~0;
++   else {
++      /* If we don't in fact send enough texture coordinates, q will be 1,
++       * making TEXCOORD_PROJ act like TEXCOORD_NONPROJ (Right?)
++       */
++      t->pp_txformat_x |= R200_TEXCOORD_PROJ;
+    }
+ 
+-   ASSERT(tObj->Target == GL_TEXTURE_CUBE_MAP);
+-
+-   if ( t->base.dirty_images[0] || t->base.dirty_images[1] ||
+-        t->base.dirty_images[2] || t->base.dirty_images[3] ||
+-        t->base.dirty_images[4] || t->base.dirty_images[5] ) {
+-      /* flush */
+-      R200_FIREVERTICES( rmesa );
+-      /* layout memory space, once for all faces */
+-      r200SetTexImages( rmesa, tObj );
+-   }
++   t->pp_txsize = (((firstImage->Width - 1) << R200_PP_TX_WIDTHMASK_SHIFT)
++		   | ((firstImage->Height - 1) << R200_PP_TX_HEIGHTMASK_SHIFT));
+ 
+-   /* upload (per face) */
+-   for (face = 0; face < 6; face++) {
+-      if (t->base.dirty_images[face]) {
+-         r200UploadTexImages( rmesa, (r200TexObjPtr) tObj->DriverData, face );
+-      }
+-   }
+-      
+-   if ( !t->base.memBlock ) {
+-      /* texmem alloc failed, use s/w fallback */
+-      return GL_FALSE;
++   if ( !t->image_override ) {
++      if (firstImage->IsCompressed)
++         t->pp_txpitch = (firstImage->Width + 63) & ~(63);
++      else
++         t->pp_txpitch = ((firstImage->Width * texelBytes) + 63) & ~(63);
++      t->pp_txpitch -= 32;
+    }
+ 
+-   set_re_cntl_d3d( ctx, unit, GL_TRUE );
+-
+-   return GL_TRUE;
+-}
+-
+-static GLboolean enable_tex_rect( GLcontext *ctx, int unit )
+-{
+-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+-   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+-   struct gl_texture_object *tObj = texUnit->_Current;
+-   r200TexObjPtr t = (r200TexObjPtr) tObj->DriverData;
+-
+-   if (!(t->pp_txformat & R200_TXFORMAT_NON_POWER2)) {
++   if (t->base.Target == GL_TEXTURE_RECTANGLE_NV) {
+       t->pp_txformat |= R200_TXFORMAT_NON_POWER2;
+-      t->base.dirty_images[0] = ~0;
+    }
+ 
+-   ASSERT(tObj->Target == GL_TEXTURE_RECTANGLE_NV);
+-
+-   if ( t->base.dirty_images[0] ) {
+-      R200_FIREVERTICES( rmesa );
+-      r200SetTexImages( rmesa, tObj );
+-      r200UploadTexImages( rmesa, (r200TexObjPtr) tObj->DriverData, 0 );
+-      if ( !t->base.memBlock &&
+-           !t->image_override &&
+-           !rmesa->prefer_gart_client_texturing ) 
+-	 return GL_FALSE;
+-   }
+-
+-   set_re_cntl_d3d( ctx, unit, GL_FALSE );
+-
+-   return GL_TRUE;
+ }
+ 
+-
+-static GLboolean update_tex_common( GLcontext *ctx, int unit )
++static GLboolean r200_validate_texture(GLcontext *ctx, struct gl_texture_object *texObj, int unit)
+ {
+    r200ContextPtr rmesa = R200_CONTEXT(ctx);
+-   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+-   struct gl_texture_object *tObj = texUnit->_Current;
+-   r200TexObjPtr t = (r200TexObjPtr) tObj->DriverData;
+-
+-   /* Fallback if there's a texture border */
+-   if ( tObj->Image[0][tObj->BaseLevel]->Border > 0 )
+-       return GL_FALSE;
+-
+-   /* Update state if this is a different texture object to last
+-    * time.
+-    */
+-   if ( rmesa->state.texture.unit[unit].texobj != t ) {
+-      if ( rmesa->state.texture.unit[unit].texobj != NULL ) {
+-	 /* The old texture is no longer bound to this texture unit.
+-	  * Mark it as such.
+-	  */
+-
+-	 rmesa->state.texture.unit[unit].texobj->base.bound &= 
+-	     ~(1UL << unit);
+-      }
++   radeonTexObj *t = radeon_tex_obj(texObj);
+ 
+-      rmesa->state.texture.unit[unit].texobj = t;
+-      t->base.bound |= (1UL << unit);
+-      t->dirty_state |= 1<<unit;
+-      driUpdateTextureLRU( (driTextureObject *) t ); /* XXX: should be locked! */
+-   }
+-
+-
+-   /* Newly enabled?
+-    */
+-   if ( 1|| !(rmesa->hw.ctx.cmd[CTX_PP_CNTL] & (R200_TEX_0_ENABLE<<unit))) {
+-      R200_STATECHANGE( rmesa, ctx );
+-      rmesa->hw.ctx.cmd[CTX_PP_CNTL] |= R200_TEX_0_ENABLE << unit;
++   if (!radeon_validate_texture_miptree(ctx, texObj))
++      return GL_FALSE;
+ 
+-      R200_STATECHANGE( rmesa, vtx );
+-      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_1] &= ~(7 << (unit * 3));
+-      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_1] |= 4 << (unit * 3);
++   r200_validate_texgen(ctx, unit);
++   /* Configure the hardware registers (more precisely, the cached version
++    * of the hardware registers). */
++   setup_hardware_state(rmesa, t);
++
++   if (texObj->Target == GL_TEXTURE_RECTANGLE_NV ||
++       texObj->Target == GL_TEXTURE_2D ||
++       texObj->Target == GL_TEXTURE_1D)
++      set_re_cntl_d3d( ctx, unit, GL_FALSE );
++   else
++      set_re_cntl_d3d( ctx, unit, GL_TRUE );
++   R200_STATECHANGE( rmesa, ctx );
++   rmesa->hw.ctx.cmd[CTX_PP_CNTL] |= R200_TEX_0_ENABLE << unit;
++   
++   R200_STATECHANGE( rmesa, vtx );
++   rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_1] &= ~(7 << (unit * 3));
++   rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_1] |= 4 << (unit * 3);
+ 
+-      rmesa->recheck_texgen[unit] = GL_TRUE;
+-   }
+-
+-   if (t->dirty_state & (1<<unit)) {
+-      import_tex_obj_state( rmesa, unit, t );
+-   }
++   rmesa->recheck_texgen[unit] = GL_TRUE;
++   import_tex_obj_state( rmesa, unit, t );
+ 
+    if (rmesa->recheck_texgen[unit]) {
+       GLboolean fallback = !r200_validate_texgen( ctx, unit );
+       TCL_FALLBACK( ctx, (R200_TCL_FALLBACK_TEXGEN_0<<unit), fallback);
+       rmesa->recheck_texgen[unit] = 0;
+-      rmesa->NewGLState |= _NEW_TEXTURE_MATRIX;
++      rmesa->radeon.NewGLState |= _NEW_TEXTURE_MATRIX;
+    }
+ 
+-   FALLBACK( rmesa, R200_FALLBACK_BORDER_MODE, t->border_fallback );
+-   return !t->border_fallback;
+-}
++   t->validated = GL_TRUE;
+ 
++   FALLBACK( rmesa, RADEON_FALLBACK_BORDER_MODE, t->border_fallback );
+ 
++   return !t->border_fallback;
++}
+ 
+-static GLboolean r200UpdateTextureUnit( GLcontext *ctx, int unit )
++static GLboolean r200UpdateTextureUnit(GLcontext *ctx, int unit)
+ {
+    r200ContextPtr rmesa = R200_CONTEXT(ctx);
+    GLuint unitneeded = rmesa->state.texture.unit[unit].unitneeded;
+ 
+-   if ( unitneeded & (TEXTURE_RECT_BIT) ) {
+-      return (enable_tex_rect( ctx, unit ) &&
+-	      update_tex_common( ctx, unit ));
+-   }
+-   else if ( unitneeded & (TEXTURE_1D_BIT | TEXTURE_2D_BIT) ) {
+-      return (enable_tex_2d( ctx, unit ) &&
+-	      update_tex_common( ctx, unit ));
+-   }
+-#if ENABLE_HW_3D_TEXTURE
+-   else if ( unitneeded & (TEXTURE_3D_BIT) ) {
+-      return (enable_tex_3d( ctx, unit ) &&
+-	      update_tex_common( ctx, unit ));
+-   }
+-#endif
+-   else if ( unitneeded & (TEXTURE_CUBE_BIT) ) {
+-      return (enable_tex_cube( ctx, unit ) &&
+-	      update_tex_common( ctx, unit ));
+-   }
+-   else if ( unitneeded ) {
+-      return GL_FALSE;
+-   }
+-   else {
+-      disable_tex( ctx, unit );
+-      return GL_TRUE;
++   if (!unitneeded) {
++      /* disable the unit */
++     disable_tex_obj_state(rmesa, unit);
++     return GL_TRUE;
+    }
++
++   if (!r200_validate_texture(ctx, ctx->Texture.Unit[unit]._Current, unit)) {
++    _mesa_warning(ctx,
++		  "failed to validate texture for unit %d.\n",
++		  unit);
++    rmesa->state.texture.unit[unit].texobj = NULL;
++    return GL_FALSE;
++  }
++
++   rmesa->state.texture.unit[unit].texobj = radeon_tex_obj(ctx->Texture.Unit[unit]._Current);
++  return GL_TRUE;
+ }
+ 
+ 
+@@ -1846,11 +1491,11 @@ void r200UpdateTextureState( GLcontext *ctx )
+ 
+    FALLBACK( rmesa, R200_FALLBACK_TEXTURE, !ok );
+ 
+-   if (rmesa->TclFallback)
++   if (rmesa->radeon.TclFallback)
+       r200ChooseVertexState( ctx );
+ 
+ 
+-   if (rmesa->r200Screen->chip_family == CHIP_FAMILY_R200) {
++   if (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_R200) {
+ 
+       /*
+        * T0 hang workaround -------------
+@@ -1863,7 +1508,7 @@ void r200UpdateTextureState( GLcontext *ctx )
+ 	 R200_STATECHANGE(rmesa, tex[1]);
+ 	 rmesa->hw.ctx.cmd[CTX_PP_CNTL] |= R200_TEX_1_ENABLE;
+ 	 if (!(rmesa->hw.cst.cmd[CST_PP_CNTL_X] & R200_PPX_TEX_1_ENABLE))
+-	    rmesa->hw.tex[1].cmd[TEX_PP_TXFORMAT] &= ~TEXOBJ_TXFORMAT_MASK;
++	   rmesa->hw.tex[1].cmd[TEX_PP_TXFORMAT] &= ~TEXOBJ_TXFORMAT_MASK;
+ 	 rmesa->hw.tex[1].cmd[TEX_PP_TXFORMAT] |= R200_TXFORMAT_LOOKUP_DISABLE;
+       }
+       else if (!ctx->ATIFragmentShader._Enabled) {
+diff --git a/src/mesa/drivers/dri/r200/r200_vertprog.c b/src/mesa/drivers/dri/r200/r200_vertprog.c
+index 562992f..888f91d 100644
+--- a/src/mesa/drivers/dri/r200/r200_vertprog.c
++++ b/src/mesa/drivers/dri/r200/r200_vertprog.c
+@@ -1110,9 +1110,9 @@ void r200SetupVertexProg( GLcontext *ctx ) {
+    }
+    /* could optimize setting up vertex progs away for non-tcl hw */
+    fallback = !(vp->native && r200VertexProgUpdateParams(ctx, vp) &&
+-      rmesa->r200Screen->drmSupportsVertexProgram);
++      rmesa->radeon.radeonScreen->drmSupportsVertexProgram);
+    TCL_FALLBACK(ctx, R200_TCL_FALLBACK_VERTEX_PROGRAM, fallback);
+-   if (rmesa->TclFallback) return;
++   if (rmesa->radeon.TclFallback) return;
+ 
+    R200_STATECHANGE( rmesa, vap );
+    /* FIXME: fglrx sets R200_VAP_SINGLE_BUF_STATE_ENABLE too. Do we need it?
+diff --git a/src/mesa/drivers/dri/r300/Makefile b/src/mesa/drivers/dri/r300/Makefile
+index 6ca9342..497b1ec 100644
+--- a/src/mesa/drivers/dri/r300/Makefile
++++ b/src/mesa/drivers/dri/r300/Makefile
+@@ -3,6 +3,8 @@
+ TOP = ../../../../..
+ include $(TOP)/configs/current
+ 
++CFLAGS += $(RADEON_CFLAGS)
++
+ LIBNAME = r300_dri.so
+ 
+ MINIGLX_SOURCES = server/radeon_dri.c
+@@ -20,20 +22,24 @@ COMMON_SOURCES = \
+ 	../common/xmlconfig.c \
+ 	../common/dri_util.c
+ 
++RADEON_COMMON_SOURCES = \
++	radeon_texture.c \
++	radeon_common_context.c \
++	radeon_common.c \
++	radeon_dma.c \
++	radeon_lock.c \
++	radeon_bo_legacy.c \
++	radeon_cs_legacy.c \
++	radeon_mipmap_tree.c \
++	radeon_span.c
++
+ DRIVER_SOURCES = \
+ 		 radeon_screen.c \
+-		 radeon_context.c \
+-		 radeon_ioctl.c \
+-		 radeon_lock.c \
+-		 radeon_span.c \
+-		 radeon_state.c \
+-		 r300_mem.c \
+ 		 r300_context.c \
+ 		 r300_ioctl.c \
+ 		 r300_cmdbuf.c \
+ 		 r300_state.c \
+ 		 r300_render.c \
+-		 r300_texmem.c \
+ 		 r300_tex.c \
+ 		 r300_texstate.c \
+ 		 radeon_program.c \
+@@ -49,12 +55,15 @@ DRIVER_SOURCES = \
+ 		 r300_shader.c \
+ 		 r300_emit.c \
+ 		 r300_swtcl.c \
++		 $(RADEON_COMMON_SOURCES) \
+ 		 $(EGL_SOURCES)
+ 
+ C_SOURCES = $(COMMON_SOURCES) $(DRIVER_SOURCES)
+ 
+ DRIVER_DEFINES = -DCOMPILE_R300 -DR200_MERGED=0 \
+-	-DRADEON_COMMON=1 -DRADEON_COMMON_FOR_R300
++	-DRADEON_COMMON=1 -DRADEON_COMMON_FOR_R300 \
++#	-DRADEON_BO_TRACK \
++	-Wall
+ 
+ SYMLINKS = \
+ 	server/radeon_dri.c \
+@@ -68,7 +77,28 @@ COMMON_SYMLINKS = \
+ 	radeon_chipset.h \
+ 	radeon_screen.c \
+ 	radeon_screen.h \
+-	radeon_span.h
++	radeon_span.h \
++	radeon_span.c \
++	radeon_bo_legacy.c \
++	radeon_cs_legacy.c \
++	radeon_bo_legacy.h \
++	radeon_cs_legacy.h \
++	radeon_bocs_wrapper.h \
++	radeon_lock.c \
++	radeon_lock.h \
++	radeon_common.c \
++	radeon_common.h \
++	radeon_common_context.c \
++	radeon_common_context.h \
++	radeon_cmdbuf.h \
++	radeon_dma.c \
++	radeon_dma.h \
++	radeon_mipmap_tree.c \
++	radeon_mipmap_tree.h \
++	radeon_texture.c \
++	radeon_texture.h
++
++DRI_LIB_DEPS += $(RADEON_LDFLAGS)
+ 
+ ##### TARGETS #####
+ 
+diff --git a/src/mesa/drivers/dri/r300/r300_cmdbuf.c b/src/mesa/drivers/dri/r300/r300_cmdbuf.c
+index c9e1dfe..cfc9785 100644
+--- a/src/mesa/drivers/dri/r300/r300_cmdbuf.c
++++ b/src/mesa/drivers/dri/r300/r300_cmdbuf.c
+@@ -44,235 +44,288 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "drm.h"
+ #include "radeon_drm.h"
+ 
+-#include "radeon_ioctl.h"
+ #include "r300_context.h"
+ #include "r300_ioctl.h"
+ #include "radeon_reg.h"
+ #include "r300_reg.h"
+ #include "r300_cmdbuf.h"
+ #include "r300_emit.h"
++#include "radeon_bocs_wrapper.h"
++#include "radeon_mipmap_tree.h"
+ #include "r300_state.h"
++#include "radeon_reg.h"
+ 
+-// Set this to 1 for extremely verbose debugging of command buffers
+-#define DEBUG_CMDBUF		0
++#define R300_VAP_PVS_UPLOAD_ADDRESS 0x2200
++#   define RADEON_ONE_REG_WR        (1 << 15)
+ 
+-/**
+- * Send the current command buffer via ioctl to the hardware.
++/** # of dwords reserved for additional instructions that may need to be written
++ * during flushing.
+  */
+-int r300FlushCmdBufLocked(r300ContextPtr r300, const char *caller)
++#define SPACE_FOR_FLUSHING	4
++
++static unsigned packet0_count(r300ContextPtr r300, uint32_t *pkt)
+ {
+-	int ret;
+-	int i;
+-	drm_radeon_cmd_buffer_t cmd;
+-	int start;
+-
+-	if (r300->radeon.lost_context) {
+-		start = 0;
+-		r300->radeon.lost_context = GL_FALSE;
+-	} else
+-		start = r300->cmdbuf.count_reemit;
+-
+-	if (RADEON_DEBUG & DEBUG_IOCTL) {
+-		fprintf(stderr, "%s from %s - %i cliprects\n",
+-			__FUNCTION__, caller, r300->radeon.numClipRects);
+-
+-		if (DEBUG_CMDBUF && RADEON_DEBUG & DEBUG_VERBOSE)
+-			for (i = start; i < r300->cmdbuf.count_used; ++i)
+-				fprintf(stderr, "%d: %08x\n", i,
+-					r300->cmdbuf.cmd_buf[i]);
+-	}
++    if (r300->radeon.radeonScreen->kernel_mm) {
++        return ((((*pkt) >> 16) & 0x3FFF) + 1);
++    } else {
++        drm_r300_cmd_header_t *t = (drm_r300_cmd_header_t*)pkt;
++        return t->packet0.count;
 +    }
-+    bo_legacy->ptr = boml->screen->gartTextures.map + base_offset;
-+    bo_legacy->offset = boml->screen->gart_texture_offset + base_offset;
-+    bo->size = size;
-+    boml->dma_alloc_size += size;
-+    boml->dma_buf_count++;
 +    return 0;
 +}
-+
-+static int bo_dma_free(struct radeon_bo *bo)
+ 
+-	cmd.buf = (char *)(r300->cmdbuf.cmd_buf + start);
+-	cmd.bufsz = (r300->cmdbuf.count_used - start) * 4;
++#define vpu_count(ptr) (((drm_r300_cmd_header_t*)(ptr))->vpu.count)
++#define r500fp_count(ptr) (((drm_r300_cmd_header_t*)(ptr))->r500fp.count)
+ 
+-	if (r300->radeon.state.scissor.enabled) {
+-		cmd.nbox = r300->radeon.state.scissor.numClipRects;
+-		cmd.boxes =
+-		    (drm_clip_rect_t *) r300->radeon.state.scissor.pClipRects;
+-	} else {
+-		cmd.nbox = r300->radeon.numClipRects;
+-		cmd.boxes = (drm_clip_rect_t *) r300->radeon.pClipRects;
++void emit_vpu(GLcontext *ctx, struct radeon_state_atom * atom)
 +{
-+    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bo->bom;
-+    struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
-+    drm_radeon_mem_free_t memfree;
-+    int r;
++	r300ContextPtr r300 = R300_CONTEXT(ctx);
++	BATCH_LOCALS(&r300->radeon);
++	drm_r300_cmd_header_t cmd;
++	uint32_t addr, ndw, i;
++	
++	if (!r300->radeon.radeonScreen->kernel_mm) {
++		uint32_t dwords;
++		dwords = (*atom->check) (ctx, atom);
++		BEGIN_BATCH_NO_AUTOSTATE(dwords);
++		OUT_BATCH_TABLE(atom->cmd, dwords);
++		END_BATCH();
++		return;
+ 	}
+-
+-	ret = drmCommandWrite(r300->radeon.dri.fd,
+-			      DRM_RADEON_CMDBUF, &cmd, sizeof(cmd));
+-
+-	if (RADEON_DEBUG & DEBUG_SYNC) {
+-		fprintf(stderr, "Syncing in %s (from %s)\n\n",
+-			__FUNCTION__, caller);
+-		radeonWaitForIdleLocked(&r300->radeon);
++	
++	cmd.u = atom->cmd[0];
++	addr = (cmd.vpu.adrhi << 8) | cmd.vpu.adrlo;
++	ndw = cmd.vpu.count * 4;
++	if (ndw) {
++
++		if (r300->vap_flush_needed) {
++			BEGIN_BATCH_NO_AUTOSTATE(15 + ndw);
++
++			/* flush processing vertices */
++			OUT_BATCH_REGVAL(R300_SC_SCREENDOOR, 0);
++			OUT_BATCH_REGVAL(R300_RB3D_DSTCACHE_CTLSTAT, R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D);
++			OUT_BATCH_REGVAL(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
++			OUT_BATCH_REGVAL(R300_SC_SCREENDOOR, 0xffffff);
++			OUT_BATCH_REGVAL(R300_VAP_PVS_STATE_FLUSH_REG, 0);
++			r300->vap_flush_needed = GL_FALSE;
++		} else {
++			BEGIN_BATCH_NO_AUTOSTATE(5 + ndw);
++		}
++		OUT_BATCH_REGVAL(R300_VAP_PVS_UPLOAD_ADDRESS, addr);
++		OUT_BATCH(CP_PACKET0(R300_VAP_PVS_UPLOAD_DATA, ndw-1) | RADEON_ONE_REG_WR);
++		for (i = 0; i < ndw; i++) {
++			OUT_BATCH(atom->cmd[i+1]);
++		}
++		OUT_BATCH_REGVAL(R300_VAP_PVS_STATE_FLUSH_REG, 0);
++		END_BATCH();
+ 	}
+-
+-	r300->dma.nr_released_bufs = 0;
+-	r300->cmdbuf.count_used = 0;
+-	r300->cmdbuf.count_reemit = 0;
+-
+-	return ret;
+ }
+ 
+-int r300FlushCmdBuf(r300ContextPtr r300, const char *caller)
++void emit_r500fp(GLcontext *ctx, struct radeon_state_atom * atom)
+ {
+-	int ret;
++	r300ContextPtr r300 = R300_CONTEXT(ctx);
++	BATCH_LOCALS(&r300->radeon);
++	drm_r300_cmd_header_t cmd;
++	uint32_t addr, ndw, i, sz;
++	int type, clamp, stride;
++
++	if (!r300->radeon.radeonScreen->kernel_mm) {
++		uint32_t dwords;
++		dwords = (*atom->check) (ctx, atom);
++		BEGIN_BATCH_NO_AUTOSTATE(dwords);
++		OUT_BATCH_TABLE(atom->cmd, dwords);
++		END_BATCH();
++		return;
++	}
+ 
+-	LOCK_HARDWARE(&r300->radeon);
++	cmd.u = atom->cmd[0];
++	sz = cmd.r500fp.count;
++	addr = ((cmd.r500fp.adrhi_flags & 1) << 8) | cmd.r500fp.adrlo;
++	type = !!(cmd.r500fp.adrhi_flags & R500FP_CONSTANT_TYPE);
++	clamp = !!(cmd.r500fp.adrhi_flags & R500FP_CONSTANT_CLAMP);
+ 
+-	ret = r300FlushCmdBufLocked(r300, caller);
++	addr |= (type << 16);
++	addr |= (clamp << 17);
+ 
+-	UNLOCK_HARDWARE(&r300->radeon);
++	stride = type ? 4 : 6;
+ 
+-	if (ret) {
+-		fprintf(stderr, "drmRadeonCmdBuffer: %d\n", ret);
+-		_mesa_exit(ret);
+-	}
++	ndw = sz * stride;
++	if (ndw) {
+ 
+-	return ret;
++		BEGIN_BATCH_NO_AUTOSTATE(3 + ndw);
++		OUT_BATCH(CP_PACKET0(R500_GA_US_VECTOR_INDEX, 0));
++		OUT_BATCH(addr);
++		OUT_BATCH(CP_PACKET0(R500_GA_US_VECTOR_DATA, ndw-1) | RADEON_ONE_REG_WR);
++		for (i = 0; i < ndw; i++) {
++			OUT_BATCH(atom->cmd[i+1]);
++		}
++		END_BATCH();
++	}
+ }
+ 
+-static void r300PrintStateAtom(r300ContextPtr r300, struct r300_state_atom *state)
++static void emit_tex_offsets(GLcontext *ctx, struct radeon_state_atom * atom)
+ {
+-	int i;
+-	int dwords = (*state->check) (r300, state);
+-
+-	fprintf(stderr, "  emit %s %d/%d\n", state->name, dwords,
+-		state->cmd_size);
++	r300ContextPtr r300 = R300_CONTEXT(ctx);
++	BATCH_LOCALS(&r300->radeon);
++	int numtmus = packet0_count(r300, r300->hw.tex.offset.cmd);
++	int notexture = 0;
++
++	if (numtmus) {
++		int i;
++
++		for(i = 0; i < numtmus; ++i) {
++		    radeonTexObj *t = r300->hw.textures[i];
++		
++		    if (!t)
++			notexture = 1;
++		}
+ 
+-	if (RADEON_DEBUG & DEBUG_VERBOSE) {
+-		for (i = 0; i < dwords; i++) {
+-			fprintf(stderr, "      %s[%d]: %08x\n",
+-				state->name, i, state->cmd[i]);
++		if (r300->radeon.radeonScreen->kernel_mm && notexture) {
++			return;
+ 		}
++		BEGIN_BATCH_NO_AUTOSTATE(4 * numtmus);
++		for(i = 0; i < numtmus; ++i) {
++		    radeonTexObj *t = r300->hw.textures[i];
++		    OUT_BATCH_REGSEQ(R300_TX_OFFSET_0 + (i * 4), 1);
++		    if (t && !t->image_override) {
++			    OUT_BATCH_RELOC(t->tile_bits, t->mt->bo, 0,
++					    RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
++		    } else if (!t) {
++			    OUT_BATCH(r300->radeon.radeonScreen->texOffset[0]);
++		    } else { /* override cases */
++			    if (t->bo) {
++				    OUT_BATCH_RELOC(t->tile_bits, t->bo, 0,
++						    RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
++			    } else if (!r300->radeon.radeonScreen->kernel_mm) {
++				    OUT_BATCH(t->override_offset);
++			    }
++			    else
++			    	OUT_BATCH(r300->radeon.radeonScreen->texOffset[0]);
++		    }
++		}
++		END_BATCH();
+ 	}
+ }
+ 
+-/**
+- * Emit all atoms with a dirty field equal to dirty.
+- *
+- * The caller must have ensured that there is enough space in the command
+- * buffer.
+- */
+-static INLINE void r300EmitAtoms(r300ContextPtr r300, GLboolean dirty)
++static void emit_cb_offset(GLcontext *ctx, struct radeon_state_atom * atom)
+ {
+-	struct r300_state_atom *atom;
+-	uint32_t *dest;
+-	int dwords;
+-
+-	dest = r300->cmdbuf.cmd_buf + r300->cmdbuf.count_used;
+-
+-	/* Emit WAIT */
+-	*dest = cmdwait(R300_WAIT_3D | R300_WAIT_3D_CLEAN);
+-	dest++;
+-	r300->cmdbuf.count_used++;
+-
+-	/* Emit cache flush */
+-	*dest = cmdpacket0(R300_TX_INVALTAGS, 1);
+-	dest++;
+-	r300->cmdbuf.count_used++;
+-
+-	*dest = R300_TX_FLUSH;
+-	dest++;
+-	r300->cmdbuf.count_used++;
+-
+-	/* Emit END3D */
+-	*dest = cmdpacify();
+-	dest++;
+-	r300->cmdbuf.count_used++;
+-
+-	/* Emit actual atoms */
+-
+-	foreach(atom, &r300->hw.atomlist) {
+-		if ((atom->dirty || r300->hw.all_dirty) == dirty) {
+-			dwords = (*atom->check) (r300, atom);
+-			if (dwords) {
+-				if (DEBUG_CMDBUF && RADEON_DEBUG & DEBUG_STATE) {
+-					r300PrintStateAtom(r300, atom);
+-				}
+-				memcpy(dest, atom->cmd, dwords * 4);
+-				dest += dwords;
+-				r300->cmdbuf.count_used += dwords;
+-				atom->dirty = GL_FALSE;
+-			} else {
+-				if (DEBUG_CMDBUF && RADEON_DEBUG & DEBUG_STATE) {
+-					fprintf(stderr, "  skip state %s\n",
+-						atom->name);
+-				}
+-			}
+-		}
++	r300ContextPtr r300 = R300_CONTEXT(ctx);
++	BATCH_LOCALS(&r300->radeon);
++	struct radeon_renderbuffer *rrb;
++	uint32_t cbpitch;
 +
-+    if (bo_legacy->ptr == NULL) {
-+        /* ptr is set to NULL if dma allocation failed */
-+        return 0;
-+    }
-+    legacy_get_current_age(boml);
-+    memfree.region = RADEON_MEM_REGION_GART;
-+    memfree.region_offset  = bo_legacy->offset;
-+    memfree.region_offset -= boml->screen->gart_texture_offset;
-+    r = drmCommandWrite(boml->base.fd,
-+                        DRM_RADEON_FREE,
-+                        &memfree,
-+                        sizeof(memfree));
-+    if (r) {
-+        fprintf(stderr, "Failed to free bo[%p] at %08x\n",
-+                &bo_legacy->base, memfree.region_offset);
-+        fprintf(stderr, "ret = %s\n", strerror(-r));
-+        return r;
-+    }
-+    boml->dma_alloc_size -= bo_legacy->base.size;
-+    boml->dma_buf_count--;
-+    return 0;
++	rrb = radeon_get_colorbuffer(&r300->radeon);
++	if (!rrb || !rrb->bo) {
++		fprintf(stderr, "no rrb\n");
++		return;
+ 	}
++
++	cbpitch = (rrb->pitch / rrb->cpp);
++	if (rrb->cpp == 4)
++		cbpitch |= R300_COLOR_FORMAT_ARGB8888;
++	else
++		cbpitch |= R300_COLOR_FORMAT_RGB565;
++
++	if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE)
++		cbpitch |= R300_COLOR_TILE_ENABLE;
++
++	BEGIN_BATCH_NO_AUTOSTATE(6);
++	OUT_BATCH_REGSEQ(R300_RB3D_COLOROFFSET0, 1);
++	OUT_BATCH_RELOC(0, rrb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
++	OUT_BATCH_REGSEQ(R300_RB3D_COLORPITCH0, 1);
++	OUT_BATCH(cbpitch);
++	END_BATCH();
+ }
+ 
+-/**
+- * Copy dirty hardware state atoms into the command buffer.
+- *
+- * We also copy out clean state if we're at the start of a buffer. That makes
+- * it easy to recover from lost contexts.
+- */
+-void r300EmitState(r300ContextPtr r300)
++static void emit_zb_offset(GLcontext *ctx, struct radeon_state_atom * atom)
+ {
+-	if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_PRIMS))
+-		fprintf(stderr, "%s\n", __FUNCTION__);
++	r300ContextPtr r300 = R300_CONTEXT(ctx);
++	BATCH_LOCALS(&r300->radeon);
++	struct radeon_renderbuffer *rrb;
++	uint32_t zbpitch;
+ 
+-	if (r300->cmdbuf.count_used && !r300->hw.is_dirty
+-	    && !r300->hw.all_dirty)
++	rrb = radeon_get_depthbuffer(&r300->radeon);
++	if (!rrb)
+ 		return;
+ 
+-	/* To avoid going across the entire set of states multiple times, just check
+-	 * for enough space for the case of emitting all state, and inline the
+-	 * r300AllocCmdBuf code here without all the checks.
+-	 */
+-	r300EnsureCmdBufSpace(r300, r300->hw.max_state_size, __FUNCTION__);
+-
+-	if (!r300->cmdbuf.count_used) {
+-		if (RADEON_DEBUG & DEBUG_STATE)
+-			fprintf(stderr, "Begin reemit state\n");
+-
+-		r300EmitAtoms(r300, GL_FALSE);
+-		r300->cmdbuf.count_reemit = r300->cmdbuf.count_used;
++	zbpitch = (rrb->pitch / rrb->cpp);
++	if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE) {
++		zbpitch |= R300_DEPTHMACROTILE_ENABLE;
+ 	}
++	if (rrb->bo->flags & RADEON_BO_FLAGS_MICRO_TILE){
++		zbpitch |= R300_DEPTHMICROTILE_TILED;
++	}
++	
++	BEGIN_BATCH_NO_AUTOSTATE(6);
++	OUT_BATCH_REGSEQ(R300_ZB_DEPTHOFFSET, 1);
++	OUT_BATCH_RELOC(0, rrb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
++	OUT_BATCH_REGVAL(R300_ZB_DEPTHPITCH, zbpitch);
++	END_BATCH();
 +}
+ 
+-	if (RADEON_DEBUG & DEBUG_STATE)
+-		fprintf(stderr, "Begin dirty state\n");
+-
+-	r300EmitAtoms(r300, GL_TRUE);
+-
+-	assert(r300->cmdbuf.count_used < r300->cmdbuf.size);
++static void emit_zstencil_format(GLcontext *ctx, struct radeon_state_atom * atom)
++{
++	r300ContextPtr r300 = R300_CONTEXT(ctx);
++	BATCH_LOCALS(&r300->radeon);
++	struct radeon_renderbuffer *rrb;
++	uint32_t zbpitch;
++	uint32_t format;
 +
-+static void bo_free(struct bo_legacy *bo_legacy)
++	rrb = radeon_get_depthbuffer(&r300->radeon);
++	if (!rrb)
++	  format = 0;
++	else {
++	  if (rrb->cpp == 2)
++	    format = R300_DEPTHFORMAT_16BIT_INT_Z;
++	  else if (rrb->cpp == 4)
++	    format = R300_DEPTHFORMAT_24BIT_INT_Z_8BIT_STENCIL;
++	}
+ 
+-	r300->hw.is_dirty = GL_FALSE;
+-	r300->hw.all_dirty = GL_FALSE;
++	OUT_BATCH(atom->cmd[0]);
++	atom->cmd[1] &= ~(3 << 0);
++	atom->cmd[1] |= format;
++	OUT_BATCH(atom->cmd[1]);
++	OUT_BATCH(atom->cmd[2]);
++	OUT_BATCH(atom->cmd[3]);
++	OUT_BATCH(atom->cmd[4]);
+ }
+ 
+-#define packet0_count(ptr) (((drm_r300_cmd_header_t*)(ptr))->packet0.count)
+-#define vpu_count(ptr) (((drm_r300_cmd_header_t*)(ptr))->vpu.count)
+-#define r500fp_count(ptr) (((drm_r300_cmd_header_t*)(ptr))->r500fp.count)
+-
+-static int check_always(r300ContextPtr r300, struct r300_state_atom *atom)
++static int check_always(GLcontext *ctx, struct radeon_state_atom *atom)
+ {
+ 	return atom->cmd_size;
+ }
+ 
+-static int check_variable(r300ContextPtr r300, struct r300_state_atom *atom)
++static int check_variable(GLcontext *ctx, struct radeon_state_atom *atom)
+ {
++	r300ContextPtr r300 = R300_CONTEXT(ctx);
+ 	int cnt;
+-	cnt = packet0_count(atom->cmd);
++	if (atom->cmd[0] == CP_PACKET2) {
++		return 0;
++	}
++	cnt = packet0_count(r300, atom->cmd);
+ 	return cnt ? cnt + 1 : 0;
+ }
+ 
+-static int check_vpu(r300ContextPtr r300, struct r300_state_atom *atom)
++int check_vpu(GLcontext *ctx, struct radeon_state_atom *atom)
+ {
+ 	int cnt;
++
+ 	cnt = vpu_count(atom->cmd);
+ 	return cnt ? (cnt * 4) + 1 : 0;
+ }
+ 
+-static int check_r500fp(r300ContextPtr r300, struct r300_state_atom *atom)
++int check_r500fp(GLcontext *ctx, struct radeon_state_atom *atom)
+ {
+ 	int cnt;
++
+ 	cnt = r500fp_count(atom->cmd);
+ 	return cnt ? (cnt * 6) + 1 : 0;
+ }
+ 
+-static int check_r500fp_const(r300ContextPtr r300, struct r300_state_atom *atom)
++int check_r500fp_const(GLcontext *ctx, struct radeon_state_atom *atom)
+ {
+ 	int cnt;
++
+ 	cnt = r500fp_count(atom->cmd);
+ 	return cnt ? (cnt * 4) + 1 : 0;
+ }
+@@ -285,8 +338,8 @@ static int check_r500fp_const(r300ContextPtr r300, struct r300_state_atom *atom)
+       r300->hw.ATOM.idx = (IDX);					\
+       r300->hw.ATOM.check = check_##CHK;				\
+       r300->hw.ATOM.dirty = GL_FALSE;					\
+-      r300->hw.max_state_size += (SZ);					\
+-      insert_at_tail(&r300->hw.atomlist, &r300->hw.ATOM);		\
++      r300->radeon.hw.max_state_size += (SZ);					\
++      insert_at_tail(&r300->radeon.hw.atomlist, &r300->hw.ATOM);		\
+    } while (0)
+ /**
+  * Allocate memory for the command buffer and initialize the state atom
+@@ -294,7 +347,7 @@ static int check_r500fp_const(r300ContextPtr r300, struct r300_state_atom *atom)
+  */
+ void r300InitCmdBuf(r300ContextPtr r300)
+ {
+-	int size, mtu;
++	int mtu;
+ 	int has_tcl = 1;
+ 	int is_r500 = 0;
+ 	int i;
+@@ -305,7 +358,7 @@ void r300InitCmdBuf(r300ContextPtr r300)
+ 	if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
+ 		is_r500 = 1;
+ 
+-	r300->hw.max_state_size = 2 + 2;	/* reserve extra space for WAIT_IDLE and tex cache flush */
++	r300->radeon.hw.max_state_size = 2 + 2;	/* reserve extra space for WAIT_IDLE and tex cache flush */
+ 
+ 	mtu = r300->radeon.glCtx->Const.MaxTextureUnits;
+ 	if (RADEON_DEBUG & DEBUG_TEXTURE) {
+@@ -313,97 +366,97 @@ void r300InitCmdBuf(r300ContextPtr r300)
+ 	}
+ 
+ 	/* Setup the atom linked list */
+-	make_empty_list(&r300->hw.atomlist);
+-	r300->hw.atomlist.name = "atom-list";
++	make_empty_list(&r300->radeon.hw.atomlist);
++	r300->radeon.hw.atomlist.name = "atom-list";
+ 
+ 	/* Initialize state atoms */
+ 	ALLOC_STATE(vpt, always, R300_VPT_CMDSIZE, 0);
+-	r300->hw.vpt.cmd[R300_VPT_CMD_0] = cmdpacket0(R300_SE_VPORT_XSCALE, 6);
++	r300->hw.vpt.cmd[R300_VPT_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_SE_VPORT_XSCALE, 6);
+ 	ALLOC_STATE(vap_cntl, always, R300_VAP_CNTL_SIZE, 0);
+-	r300->hw.vap_cntl.cmd[R300_VAP_CNTL_FLUSH] = cmdpacket0(R300_VAP_PVS_STATE_FLUSH_REG, 1);
++	r300->hw.vap_cntl.cmd[R300_VAP_CNTL_FLUSH] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_PVS_STATE_FLUSH_REG, 1);
+ 	r300->hw.vap_cntl.cmd[R300_VAP_CNTL_FLUSH_1] = 0;
+-	r300->hw.vap_cntl.cmd[R300_VAP_CNTL_CMD] = cmdpacket0(R300_VAP_CNTL, 1);
++	r300->hw.vap_cntl.cmd[R300_VAP_CNTL_CMD] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_CNTL, 1);
+ 	if (is_r500) {
+ 	    ALLOC_STATE(vap_index_offset, always, 2, 0);
+-	    r300->hw.vap_index_offset.cmd[0] = cmdpacket0(R500_VAP_INDEX_OFFSET, 1);
++	    r300->hw.vap_index_offset.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R500_VAP_INDEX_OFFSET, 1);
+ 	    r300->hw.vap_index_offset.cmd[1] = 0;
+ 	}
+ 	ALLOC_STATE(vte, always, 3, 0);
+-	r300->hw.vte.cmd[0] = cmdpacket0(R300_SE_VTE_CNTL, 2);
++	r300->hw.vte.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_SE_VTE_CNTL, 2);
+ 	ALLOC_STATE(vap_vf_max_vtx_indx, always, 3, 0);
+-	r300->hw.vap_vf_max_vtx_indx.cmd[0] = cmdpacket0(R300_VAP_VF_MAX_VTX_INDX, 2);
++	r300->hw.vap_vf_max_vtx_indx.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_VF_MAX_VTX_INDX, 2);
+ 	ALLOC_STATE(vap_cntl_status, always, 2, 0);
+-	r300->hw.vap_cntl_status.cmd[0] = cmdpacket0(R300_VAP_CNTL_STATUS, 1);
++	r300->hw.vap_cntl_status.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_CNTL_STATUS, 1);
+ 	ALLOC_STATE(vir[0], variable, R300_VIR_CMDSIZE, 0);
+ 	r300->hw.vir[0].cmd[R300_VIR_CMD_0] =
+-	    cmdpacket0(R300_VAP_PROG_STREAM_CNTL_0, 1);
++	    cmdpacket0(r300->radeon.radeonScreen, R300_VAP_PROG_STREAM_CNTL_0, 1);
+ 	ALLOC_STATE(vir[1], variable, R300_VIR_CMDSIZE, 1);
+ 	r300->hw.vir[1].cmd[R300_VIR_CMD_0] =
+-	    cmdpacket0(R300_VAP_PROG_STREAM_CNTL_EXT_0, 1);
++	    cmdpacket0(r300->radeon.radeonScreen, R300_VAP_PROG_STREAM_CNTL_EXT_0, 1);
+ 	ALLOC_STATE(vic, always, R300_VIC_CMDSIZE, 0);
+-	r300->hw.vic.cmd[R300_VIC_CMD_0] = cmdpacket0(R300_VAP_VTX_STATE_CNTL, 2);
++	r300->hw.vic.cmd[R300_VIC_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_VTX_STATE_CNTL, 2);
+ 	ALLOC_STATE(vap_psc_sgn_norm_cntl, always, 2, 0);
+-	r300->hw.vap_psc_sgn_norm_cntl.cmd[0] = cmdpacket0(R300_VAP_PSC_SGN_NORM_CNTL, SGN_NORM_ZERO_CLAMP_MINUS_ONE);
++	r300->hw.vap_psc_sgn_norm_cntl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_PSC_SGN_NORM_CNTL, SGN_NORM_ZERO_CLAMP_MINUS_ONE);
+ 
+ 	if (has_tcl) {
+ 		ALLOC_STATE(vap_clip_cntl, always, 2, 0);
+-		r300->hw.vap_clip_cntl.cmd[0] = cmdpacket0(R300_VAP_CLIP_CNTL, 1);
++		r300->hw.vap_clip_cntl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_CLIP_CNTL, 1);
+ 		ALLOC_STATE(vap_clip, always, 5, 0);
+-		r300->hw.vap_clip.cmd[0] = cmdpacket0(R300_VAP_GB_VERT_CLIP_ADJ, 4);
++		r300->hw.vap_clip.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_GB_VERT_CLIP_ADJ, 4);
+ 		ALLOC_STATE(vap_pvs_vtx_timeout_reg, always, 2, 0);
+-		r300->hw.vap_pvs_vtx_timeout_reg.cmd[0] = cmdpacket0(VAP_PVS_VTX_TIMEOUT_REG, 1);
++		r300->hw.vap_pvs_vtx_timeout_reg.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, VAP_PVS_VTX_TIMEOUT_REG, 1);
+ 	}
+ 
+ 	ALLOC_STATE(vof, always, R300_VOF_CMDSIZE, 0);
+ 	r300->hw.vof.cmd[R300_VOF_CMD_0] =
+-	    cmdpacket0(R300_VAP_OUTPUT_VTX_FMT_0, 2);
++	    cmdpacket0(r300->radeon.radeonScreen, R300_VAP_OUTPUT_VTX_FMT_0, 2);
+ 
+ 	if (has_tcl) {
+ 		ALLOC_STATE(pvs, always, R300_PVS_CMDSIZE, 0);
+ 		r300->hw.pvs.cmd[R300_PVS_CMD_0] =
+-		    cmdpacket0(R300_VAP_PVS_CODE_CNTL_0, 3);
++		    cmdpacket0(r300->radeon.radeonScreen, R300_VAP_PVS_CODE_CNTL_0, 3);
+ 	}
+ 
+ 	ALLOC_STATE(gb_enable, always, 2, 0);
+-	r300->hw.gb_enable.cmd[0] = cmdpacket0(R300_GB_ENABLE, 1);
++	r300->hw.gb_enable.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GB_ENABLE, 1);
+ 	ALLOC_STATE(gb_misc, always, R300_GB_MISC_CMDSIZE, 0);
+-	r300->hw.gb_misc.cmd[0] = cmdpacket0(R300_GB_MSPOS0, 5);
++	r300->hw.gb_misc.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GB_MSPOS0, 5);
+ 	ALLOC_STATE(txe, always, R300_TXE_CMDSIZE, 0);
+-	r300->hw.txe.cmd[R300_TXE_CMD_0] = cmdpacket0(R300_TX_ENABLE, 1);
++	r300->hw.txe.cmd[R300_TXE_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_TX_ENABLE, 1);
+ 	ALLOC_STATE(ga_point_s0, always, 5, 0);
+-	r300->hw.ga_point_s0.cmd[0] = cmdpacket0(R300_GA_POINT_S0, 4);
++	r300->hw.ga_point_s0.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_POINT_S0, 4);
+ 	ALLOC_STATE(ga_triangle_stipple, always, 2, 0);
+-	r300->hw.ga_triangle_stipple.cmd[0] = cmdpacket0(R300_GA_TRIANGLE_STIPPLE, 1);
++	r300->hw.ga_triangle_stipple.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_TRIANGLE_STIPPLE, 1);
+ 	ALLOC_STATE(ps, always, R300_PS_CMDSIZE, 0);
+-	r300->hw.ps.cmd[0] = cmdpacket0(R300_GA_POINT_SIZE, 1);
++	r300->hw.ps.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_POINT_SIZE, 1);
+ 	ALLOC_STATE(ga_point_minmax, always, 4, 0);
+-	r300->hw.ga_point_minmax.cmd[0] = cmdpacket0(R300_GA_POINT_MINMAX, 3);
++	r300->hw.ga_point_minmax.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_POINT_MINMAX, 3);
+ 	ALLOC_STATE(lcntl, always, 2, 0);
+-	r300->hw.lcntl.cmd[0] = cmdpacket0(R300_GA_LINE_CNTL, 1);
++	r300->hw.lcntl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_LINE_CNTL, 1);
+ 	ALLOC_STATE(ga_line_stipple, always, 4, 0);
+-	r300->hw.ga_line_stipple.cmd[0] = cmdpacket0(R300_GA_LINE_STIPPLE_VALUE, 3);
++	r300->hw.ga_line_stipple.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_LINE_STIPPLE_VALUE, 3);
+ 	ALLOC_STATE(shade, always, 5, 0);
+-	r300->hw.shade.cmd[0] = cmdpacket0(R300_GA_ENHANCE, 4);
++	r300->hw.shade.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_ENHANCE, 4);
+ 	ALLOC_STATE(polygon_mode, always, 4, 0);
+-	r300->hw.polygon_mode.cmd[0] = cmdpacket0(R300_GA_POLY_MODE, 3);
++	r300->hw.polygon_mode.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_POLY_MODE, 3);
+ 	ALLOC_STATE(fogp, always, 3, 0);
+-	r300->hw.fogp.cmd[0] = cmdpacket0(R300_GA_FOG_SCALE, 2);
++	r300->hw.fogp.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_FOG_SCALE, 2);
+ 	ALLOC_STATE(zbias_cntl, always, 2, 0);
+-	r300->hw.zbias_cntl.cmd[0] = cmdpacket0(R300_SU_TEX_WRAP, 1);
++	r300->hw.zbias_cntl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_SU_TEX_WRAP, 1);
+ 	ALLOC_STATE(zbs, always, R300_ZBS_CMDSIZE, 0);
+ 	r300->hw.zbs.cmd[R300_ZBS_CMD_0] =
+-	    cmdpacket0(R300_SU_POLY_OFFSET_FRONT_SCALE, 4);
++	    cmdpacket0(r300->radeon.radeonScreen, R300_SU_POLY_OFFSET_FRONT_SCALE, 4);
+ 	ALLOC_STATE(occlusion_cntl, always, 2, 0);
+-	r300->hw.occlusion_cntl.cmd[0] = cmdpacket0(R300_SU_POLY_OFFSET_ENABLE, 1);
++	r300->hw.occlusion_cntl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_SU_POLY_OFFSET_ENABLE, 1);
+ 	ALLOC_STATE(cul, always, R300_CUL_CMDSIZE, 0);
+-	r300->hw.cul.cmd[R300_CUL_CMD_0] = cmdpacket0(R300_SU_CULL_MODE, 1);
++	r300->hw.cul.cmd[R300_CUL_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_SU_CULL_MODE, 1);
+ 	ALLOC_STATE(su_depth_scale, always, 3, 0);
+-	r300->hw.su_depth_scale.cmd[0] = cmdpacket0(R300_SU_DEPTH_SCALE, 2);
++	r300->hw.su_depth_scale.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_SU_DEPTH_SCALE, 2);
+ 	ALLOC_STATE(rc, always, R300_RC_CMDSIZE, 0);
+-	r300->hw.rc.cmd[R300_RC_CMD_0] = cmdpacket0(R300_RS_COUNT, 2);
++	r300->hw.rc.cmd[R300_RC_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_RS_COUNT, 2);
+ 	if (is_r500) {
+ 		ALLOC_STATE(ri, always, R500_RI_CMDSIZE, 0);
+-		r300->hw.ri.cmd[R300_RI_CMD_0] = cmdpacket0(R500_RS_IP_0, 16);
++		r300->hw.ri.cmd[R300_RI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R500_RS_IP_0, 16);
+ 		for (i = 0; i < 8; i++) {
+ 			r300->hw.ri.cmd[R300_RI_CMD_0 + i +1] =
+ 			  (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_S_SHIFT) |
+@@ -412,133 +465,149 @@ void r300InitCmdBuf(r300ContextPtr r300)
+                           (R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT);
+ 		}
+ 		ALLOC_STATE(rr, variable, R300_RR_CMDSIZE, 0);
+-		r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(R500_RS_INST_0, 1);
++		r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R500_RS_INST_0, 1);
+ 	} else {
+ 		ALLOC_STATE(ri, always, R300_RI_CMDSIZE, 0);
+-		r300->hw.ri.cmd[R300_RI_CMD_0] = cmdpacket0(R300_RS_IP_0, 8);
++		r300->hw.ri.cmd[R300_RI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_RS_IP_0, 8);
+ 		ALLOC_STATE(rr, variable, R300_RR_CMDSIZE, 0);
+-		r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(R300_RS_INST_0, 1);
++		r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_RS_INST_0, 1);
+ 	}
+ 	ALLOC_STATE(sc_hyperz, always, 3, 0);
+-	r300->hw.sc_hyperz.cmd[0] = cmdpacket0(R300_SC_HYPERZ, 2);
++	r300->hw.sc_hyperz.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_SC_HYPERZ, 2);
+ 	ALLOC_STATE(sc_screendoor, always, 2, 0);
+-	r300->hw.sc_screendoor.cmd[0] = cmdpacket0(R300_SC_SCREENDOOR, 1);
++	r300->hw.sc_screendoor.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_SC_SCREENDOOR, 1);
+ 	ALLOC_STATE(us_out_fmt, always, 6, 0);
+-	r300->hw.us_out_fmt.cmd[0] = cmdpacket0(R300_US_OUT_FMT, 5);
++	r300->hw.us_out_fmt.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_US_OUT_FMT, 5);
+ 
+ 	if (is_r500) {
+ 		ALLOC_STATE(fp, always, R500_FP_CMDSIZE, 0);
+-		r300->hw.fp.cmd[R500_FP_CMD_0] = cmdpacket0(R500_US_CONFIG, 2);
++		r300->hw.fp.cmd[R500_FP_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R500_US_CONFIG, 2);
+ 		r300->hw.fp.cmd[R500_FP_CNTL] = R500_ZERO_TIMES_ANYTHING_EQUALS_ZERO;
+-		r300->hw.fp.cmd[R500_FP_CMD_1] = cmdpacket0(R500_US_CODE_ADDR, 3);
+-		r300->hw.fp.cmd[R500_FP_CMD_2] = cmdpacket0(R500_US_FC_CTRL, 1);
++		r300->hw.fp.cmd[R500_FP_CMD_1] = cmdpacket0(r300->radeon.radeonScreen, R500_US_CODE_ADDR, 3);
++		r300->hw.fp.cmd[R500_FP_CMD_2] = cmdpacket0(r300->radeon.radeonScreen, R500_US_FC_CTRL, 1);
+ 		r300->hw.fp.cmd[R500_FP_FC_CNTL] = 0; /* FIXME when we add flow control */
+ 
+ 		ALLOC_STATE(r500fp, r500fp, R500_FPI_CMDSIZE, 0);
+-		r300->hw.r500fp.cmd[R300_FPI_CMD_0] = cmdr500fp(0, 0, 0, 0);
++		r300->hw.r500fp.cmd[R300_FPI_CMD_0] =
++			cmdr500fp(r300->radeon.radeonScreen, 0, 0, 0, 0);
++		r300->hw.r500fp.emit = emit_r500fp;
+ 		ALLOC_STATE(r500fp_const, r500fp_const, R500_FPP_CMDSIZE, 0);
+-		r300->hw.r500fp_const.cmd[R300_FPI_CMD_0] = cmdr500fp(0, 0, 1, 0);
++		r300->hw.r500fp_const.cmd[R300_FPI_CMD_0] =
++			cmdr500fp(r300->radeon.radeonScreen, 0, 0, 1, 0);
++		r300->hw.r500fp_const.emit = emit_r500fp;
+ 	} else {
+ 		ALLOC_STATE(fp, always, R300_FP_CMDSIZE, 0);
+-		r300->hw.fp.cmd[R300_FP_CMD_0] = cmdpacket0(R300_US_CONFIG, 3);
+-		r300->hw.fp.cmd[R300_FP_CMD_1] = cmdpacket0(R300_US_CODE_ADDR_0, 4);
++		r300->hw.fp.cmd[R300_FP_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_US_CONFIG, 3);
++		r300->hw.fp.cmd[R300_FP_CMD_1] = cmdpacket0(r300->radeon.radeonScreen, R300_US_CODE_ADDR_0, 4);
++
+ 		ALLOC_STATE(fpt, variable, R300_FPT_CMDSIZE, 0);
+-		r300->hw.fpt.cmd[R300_FPT_CMD_0] = cmdpacket0(R300_US_TEX_INST_0, 0);
++		r300->hw.fpt.cmd[R300_FPT_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_US_TEX_INST_0, 0);
+ 
+ 		ALLOC_STATE(fpi[0], variable, R300_FPI_CMDSIZE, 0);
+-		r300->hw.fpi[0].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_INST_0, 1);
++		r300->hw.fpi[0].cmd[R300_FPI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_US_ALU_RGB_INST_0, 1);
+ 		ALLOC_STATE(fpi[1], variable, R300_FPI_CMDSIZE, 1);
+-		r300->hw.fpi[1].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_ADDR_0, 1);
++		r300->hw.fpi[1].cmd[R300_FPI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_US_ALU_RGB_ADDR_0, 1);
+ 		ALLOC_STATE(fpi[2], variable, R300_FPI_CMDSIZE, 2);
+-		r300->hw.fpi[2].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_INST_0, 1);
++		r300->hw.fpi[2].cmd[R300_FPI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_US_ALU_ALPHA_INST_0, 1);
+ 		ALLOC_STATE(fpi[3], variable, R300_FPI_CMDSIZE, 3);
+-		r300->hw.fpi[3].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_ADDR_0, 1);
++		r300->hw.fpi[3].cmd[R300_FPI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_US_ALU_ALPHA_ADDR_0, 1);
+ 		ALLOC_STATE(fpp, variable, R300_FPP_CMDSIZE, 0);
+-		r300->hw.fpp.cmd[R300_FPP_CMD_0] = cmdpacket0(R300_PFS_PARAM_0_X, 0);
++		r300->hw.fpp.cmd[R300_FPP_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_PFS_PARAM_0_X, 0);
+ 	}
+ 	ALLOC_STATE(fogs, always, R300_FOGS_CMDSIZE, 0);
+-	r300->hw.fogs.cmd[R300_FOGS_CMD_0] = cmdpacket0(R300_FG_FOG_BLEND, 1);
++	r300->hw.fogs.cmd[R300_FOGS_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_FG_FOG_BLEND, 1);
+ 	ALLOC_STATE(fogc, always, R300_FOGC_CMDSIZE, 0);
+-	r300->hw.fogc.cmd[R300_FOGC_CMD_0] = cmdpacket0(R300_FG_FOG_COLOR_R, 3);
++	r300->hw.fogc.cmd[R300_FOGC_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_FG_FOG_COLOR_R, 3);
+ 	ALLOC_STATE(at, always, R300_AT_CMDSIZE, 0);
+-	r300->hw.at.cmd[R300_AT_CMD_0] = cmdpacket0(R300_FG_ALPHA_FUNC, 2);
++	r300->hw.at.cmd[R300_AT_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_FG_ALPHA_FUNC, 2);
+ 	ALLOC_STATE(fg_depth_src, always, 2, 0);
+-	r300->hw.fg_depth_src.cmd[0] = cmdpacket0(R300_FG_DEPTH_SRC, 1);
++	r300->hw.fg_depth_src.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_FG_DEPTH_SRC, 1);
+ 	ALLOC_STATE(rb3d_cctl, always, 2, 0);
+-	r300->hw.rb3d_cctl.cmd[0] = cmdpacket0(R300_RB3D_CCTL, 1);
++	r300->hw.rb3d_cctl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_CCTL, 1);
+ 	ALLOC_STATE(bld, always, R300_BLD_CMDSIZE, 0);
+-	r300->hw.bld.cmd[R300_BLD_CMD_0] = cmdpacket0(R300_RB3D_CBLEND, 2);
++	r300->hw.bld.cmd[R300_BLD_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_CBLEND, 2);
+ 	ALLOC_STATE(cmk, always, R300_CMK_CMDSIZE, 0);
+-	r300->hw.cmk.cmd[R300_CMK_CMD_0] = cmdpacket0(RB3D_COLOR_CHANNEL_MASK, 1);
++	r300->hw.cmk.cmd[R300_CMK_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, RB3D_COLOR_CHANNEL_MASK, 1);
+ 	if (is_r500) {
+ 		ALLOC_STATE(blend_color, always, 3, 0);
+-		r300->hw.blend_color.cmd[0] = cmdpacket0(R500_RB3D_CONSTANT_COLOR_AR, 2);
++		r300->hw.blend_color.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R500_RB3D_CONSTANT_COLOR_AR, 2);
+ 	} else {
+ 		ALLOC_STATE(blend_color, always, 2, 0);
+-		r300->hw.blend_color.cmd[0] = cmdpacket0(R300_RB3D_BLEND_COLOR, 1);
++		r300->hw.blend_color.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_BLEND_COLOR, 1);
+ 	}
+ 	ALLOC_STATE(rop, always, 2, 0);
+-	r300->hw.rop.cmd[0] = cmdpacket0(R300_RB3D_ROPCNTL, 1);
++	r300->hw.rop.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_ROPCNTL, 1);
+ 	ALLOC_STATE(cb, always, R300_CB_CMDSIZE, 0);
+-	r300->hw.cb.cmd[R300_CB_CMD_0] = cmdpacket0(R300_RB3D_COLOROFFSET0, 1);
+-	r300->hw.cb.cmd[R300_CB_CMD_1] = cmdpacket0(R300_RB3D_COLORPITCH0, 1);
++	r300->hw.cb.emit = &emit_cb_offset;
+ 	ALLOC_STATE(rb3d_dither_ctl, always, 10, 0);
+-	r300->hw.rb3d_dither_ctl.cmd[0] = cmdpacket0(R300_RB3D_DITHER_CTL, 9);
++	r300->hw.rb3d_dither_ctl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_DITHER_CTL, 9);
+ 	ALLOC_STATE(rb3d_aaresolve_ctl, always, 2, 0);
+-	r300->hw.rb3d_aaresolve_ctl.cmd[0] = cmdpacket0(R300_RB3D_AARESOLVE_CTL, 1);
++	r300->hw.rb3d_aaresolve_ctl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_AARESOLVE_CTL, 1);
+ 	ALLOC_STATE(rb3d_discard_src_pixel_lte_threshold, always, 3, 0);
+-	r300->hw.rb3d_discard_src_pixel_lte_threshold.cmd[0] = cmdpacket0(R500_RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD, 2);
++	r300->hw.rb3d_discard_src_pixel_lte_threshold.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R500_RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD, 2);
+ 	ALLOC_STATE(zs, always, R300_ZS_CMDSIZE, 0);
+ 	r300->hw.zs.cmd[R300_ZS_CMD_0] =
+-	    cmdpacket0(R300_ZB_CNTL, 3);
++	    cmdpacket0(r300->radeon.radeonScreen, R300_ZB_CNTL, 3);
++
+ 	ALLOC_STATE(zstencil_format, always, 5, 0);
+ 	r300->hw.zstencil_format.cmd[0] =
+-	    cmdpacket0(R300_ZB_FORMAT, 4);
++	    cmdpacket0(r300->radeon.radeonScreen, R300_ZB_FORMAT, 4);
++	r300->hw.zstencil_format.emit = emit_zstencil_format;
++
+ 	ALLOC_STATE(zb, always, R300_ZB_CMDSIZE, 0);
+-	r300->hw.zb.cmd[R300_ZB_CMD_0] = cmdpacket0(R300_ZB_DEPTHOFFSET, 2);
++	r300->hw.zb.emit = emit_zb_offset;
+ 	ALLOC_STATE(zb_depthclearvalue, always, 2, 0);
+-	r300->hw.zb_depthclearvalue.cmd[0] = cmdpacket0(R300_ZB_DEPTHCLEARVALUE, 1);
++	r300->hw.zb_depthclearvalue.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_ZB_DEPTHCLEARVALUE, 1);
+ 	ALLOC_STATE(unk4F30, always, 3, 0);
+-	r300->hw.unk4F30.cmd[0] = cmdpacket0(0x4F30, 2);
++	r300->hw.unk4F30.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, 0x4F30, 2);
+ 	ALLOC_STATE(zb_hiz_offset, always, 2, 0);
+-	r300->hw.zb_hiz_offset.cmd[0] = cmdpacket0(R300_ZB_HIZ_OFFSET, 1);
++	r300->hw.zb_hiz_offset.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_ZB_HIZ_OFFSET, 1);
+ 	ALLOC_STATE(zb_hiz_pitch, always, 2, 0);
+-	r300->hw.zb_hiz_pitch.cmd[0] = cmdpacket0(R300_ZB_HIZ_PITCH, 1);
++	r300->hw.zb_hiz_pitch.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_ZB_HIZ_PITCH, 1);
+ 
+ 	/* VPU only on TCL */
+ 	if (has_tcl) {
+    	        int i;
+ 		ALLOC_STATE(vpi, vpu, R300_VPI_CMDSIZE, 0);
+-		r300->hw.vpi.cmd[R300_VPI_CMD_0] =
+-		    cmdvpu(R300_PVS_CODE_START, 0);
++		r300->hw.vpi.cmd[0] =
++		    cmdvpu(r300->radeon.radeonScreen, R300_PVS_CODE_START, 0);
++		r300->hw.vpi.emit = emit_vpu;
+ 
+ 		if (is_r500) {
+ 		    ALLOC_STATE(vpp, vpu, R300_VPP_CMDSIZE, 0);
+-		    r300->hw.vpp.cmd[R300_VPP_CMD_0] =
+-			cmdvpu(R500_PVS_CONST_START, 0);
++		    r300->hw.vpp.cmd[0] =
++			cmdvpu(r300->radeon.radeonScreen, R500_PVS_CONST_START, 0);
++		    r300->hw.vpp.emit = emit_vpu;
+ 
+ 		    ALLOC_STATE(vps, vpu, R300_VPS_CMDSIZE, 0);
+-		    r300->hw.vps.cmd[R300_VPS_CMD_0] =
+-			cmdvpu(R500_POINT_VPORT_SCALE_OFFSET, 1);
++		    r300->hw.vps.cmd[0] =
++			cmdvpu(r300->radeon.radeonScreen, R500_POINT_VPORT_SCALE_OFFSET, 1);
++		    r300->hw.vps.emit = emit_vpu;
+ 
+ 			for (i = 0; i < 6; i++) {
+-				ALLOC_STATE(vpucp[i], vpu, R300_VPUCP_CMDSIZE, 0);
+-				r300->hw.vpucp[i].cmd[R300_VPUCP_CMD_0] =
+-					cmdvpu(R500_PVS_UCP_START + i, 1);
++			  ALLOC_STATE(vpucp[i], vpu, R300_VPUCP_CMDSIZE, 0);
++			  r300->hw.vpucp[i].cmd[0] =
++				  cmdvpu(r300->radeon.radeonScreen,
++                           R500_PVS_UCP_START + i, 1);
++				r300->hw.vpucp[i].emit = emit_vpu;
+ 			}
+ 		} else {
+ 		    ALLOC_STATE(vpp, vpu, R300_VPP_CMDSIZE, 0);
+-		    r300->hw.vpp.cmd[R300_VPP_CMD_0] =
+-			cmdvpu(R300_PVS_CONST_START, 0);
++		    r300->hw.vpp.cmd[0] =
++			cmdvpu(r300->radeon.radeonScreen, R300_PVS_CONST_START, 0);
++		    r300->hw.vpp.emit = emit_vpu;
+ 
+ 		    ALLOC_STATE(vps, vpu, R300_VPS_CMDSIZE, 0);
+-		    r300->hw.vps.cmd[R300_VPS_CMD_0] =
+-			cmdvpu(R300_POINT_VPORT_SCALE_OFFSET, 1);
++		    r300->hw.vps.cmd[0] =
++			cmdvpu(r300->radeon.radeonScreen, R300_POINT_VPORT_SCALE_OFFSET, 1);
++		    r300->hw.vps.emit = emit_vpu;
+ 
+ 			for (i = 0; i < 6; i++) {
+ 				ALLOC_STATE(vpucp[i], vpu, R300_VPUCP_CMDSIZE, 0);
+-				r300->hw.vpucp[i].cmd[R300_VPUCP_CMD_0] =
+-					cmdvpu(R300_PVS_UCP_START + i, 1);
++				r300->hw.vpucp[i].cmd[0] =
++					cmdvpu(r300->radeon.radeonScreen,
++					       R300_PVS_UCP_START + i, 1);
++				r300->hw.vpucp[i].emit = emit_vpu;
+ 			}
+ 		}
+ 	}
+@@ -546,61 +615,39 @@ void r300InitCmdBuf(r300ContextPtr r300)
+ 	/* Textures */
+ 	ALLOC_STATE(tex.filter, variable, mtu + 1, 0);
+ 	r300->hw.tex.filter.cmd[R300_TEX_CMD_0] =
+-	    cmdpacket0(R300_TX_FILTER0_0, 0);
++	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_FILTER0_0, 0);
+ 
+ 	ALLOC_STATE(tex.filter_1, variable, mtu + 1, 0);
+ 	r300->hw.tex.filter_1.cmd[R300_TEX_CMD_0] =
+-	    cmdpacket0(R300_TX_FILTER1_0, 0);
++	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_FILTER1_0, 0);
+ 
+ 	ALLOC_STATE(tex.size, variable, mtu + 1, 0);
+-	r300->hw.tex.size.cmd[R300_TEX_CMD_0] = cmdpacket0(R300_TX_SIZE_0, 0);
++	r300->hw.tex.size.cmd[R300_TEX_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_TX_SIZE_0, 0);
+ 
+ 	ALLOC_STATE(tex.format, variable, mtu + 1, 0);
+ 	r300->hw.tex.format.cmd[R300_TEX_CMD_0] =
+-	    cmdpacket0(R300_TX_FORMAT_0, 0);
++	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_FORMAT_0, 0);
+ 
+ 	ALLOC_STATE(tex.pitch, variable, mtu + 1, 0);
+-	r300->hw.tex.pitch.cmd[R300_TEX_CMD_0] = cmdpacket0(R300_TX_FORMAT2_0, 0);
++	r300->hw.tex.pitch.cmd[R300_TEX_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_TX_FORMAT2_0, 0);
+ 
+-	ALLOC_STATE(tex.offset, variable, mtu + 1, 0);
++	ALLOC_STATE(tex.offset, variable, 1, 0);
+ 	r300->hw.tex.offset.cmd[R300_TEX_CMD_0] =
+-	    cmdpacket0(R300_TX_OFFSET_0, 0);
++	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_OFFSET_0, 0);
++	r300->hw.tex.offset.emit = &emit_tex_offsets;
+ 
+ 	ALLOC_STATE(tex.chroma_key, variable, mtu + 1, 0);
+ 	r300->hw.tex.chroma_key.cmd[R300_TEX_CMD_0] =
+-	    cmdpacket0(R300_TX_CHROMA_KEY_0, 0);
++	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_CHROMA_KEY_0, 0);
+ 
+ 	ALLOC_STATE(tex.border_color, variable, mtu + 1, 0);
+ 	r300->hw.tex.border_color.cmd[R300_TEX_CMD_0] =
+-	    cmdpacket0(R300_TX_BORDER_COLOR_0, 0);
+-
+-	r300->hw.is_dirty = GL_TRUE;
+-	r300->hw.all_dirty = GL_TRUE;
++	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_BORDER_COLOR_0, 0);
+ 
+-	/* Initialize command buffer */
+-	size =
+-	    256 * driQueryOptioni(&r300->radeon.optionCache,
+-				  "command_buffer_size");
+-	if (size < 2 * r300->hw.max_state_size) {
+-		size = 2 * r300->hw.max_state_size + 65535;
+-	}
+-	if (size > 64 * 256)
+-		size = 64 * 256;
+-
+-	if (RADEON_DEBUG & (DEBUG_IOCTL | DEBUG_DMA)) {
+-		fprintf(stderr, "sizeof(drm_r300_cmd_header_t)=%zd\n",
+-			sizeof(drm_r300_cmd_header_t));
+-		fprintf(stderr, "sizeof(drm_radeon_cmd_buffer_t)=%zd\n",
+-			sizeof(drm_radeon_cmd_buffer_t));
+-		fprintf(stderr,
+-			"Allocating %d bytes command buffer (max state is %d bytes)\n",
+-			size * 4, r300->hw.max_state_size * 4);
+-	}
++	r300->radeon.hw.is_dirty = GL_TRUE;
++	r300->radeon.hw.all_dirty = GL_TRUE;
+ 
+-	r300->cmdbuf.size = size;
+-	r300->cmdbuf.cmd_buf = (uint32_t *) CALLOC(size * 4);
+-	r300->cmdbuf.count_used = 0;
+-	r300->cmdbuf.count_reemit = 0;
++	rcommonInitCmdBuf(&r300->radeon);
+ }
+ 
+ /**
+@@ -608,68 +655,10 @@ void r300InitCmdBuf(r300ContextPtr r300)
+  */
+ void r300DestroyCmdBuf(r300ContextPtr r300)
+ {
+-	struct r300_state_atom *atom;
++	struct radeon_state_atom *atom;
+ 
+-	FREE(r300->cmdbuf.cmd_buf);
+-
+-	foreach(atom, &r300->hw.atomlist) {
++	foreach(atom, &r300->radeon.hw.atomlist) {
+ 		FREE(atom->cmd);
+ 	}
+-}
+-
+-void r300EmitBlit(r300ContextPtr rmesa,
+-		  GLuint color_fmt,
+-		  GLuint src_pitch,
+-		  GLuint src_offset,
+-		  GLuint dst_pitch,
+-		  GLuint dst_offset,
+-		  GLint srcx, GLint srcy,
+-		  GLint dstx, GLint dsty, GLuint w, GLuint h)
+-{
+-	drm_r300_cmd_header_t *cmd;
+-
+-	if (RADEON_DEBUG & DEBUG_IOCTL)
+-		fprintf(stderr,
+-			"%s src %x/%x %d,%d dst: %x/%x %d,%d sz: %dx%d\n",
+-			__FUNCTION__, src_pitch, src_offset, srcx, srcy,
+-			dst_pitch, dst_offset, dstx, dsty, w, h);
+-
+-	assert((src_pitch & 63) == 0);
+-	assert((dst_pitch & 63) == 0);
+-	assert((src_offset & 1023) == 0);
+-	assert((dst_offset & 1023) == 0);
+-	assert(w < (1 << 16));
+-	assert(h < (1 << 16));
+-
+-	cmd = (drm_r300_cmd_header_t *) r300AllocCmdBuf(rmesa, 8, __FUNCTION__);
+-
+-	cmd[0].header.cmd_type = R300_CMD_PACKET3;
+-	cmd[0].header.pad0 = R300_CMD_PACKET3_RAW;
+-	cmd[1].u = R300_CP_CMD_BITBLT_MULTI | (5 << 16);
+-	cmd[2].u = (RADEON_GMC_SRC_PITCH_OFFSET_CNTL |
+-		    RADEON_GMC_DST_PITCH_OFFSET_CNTL |
+-		    RADEON_GMC_BRUSH_NONE |
+-		    (color_fmt << 8) |
+-		    RADEON_GMC_SRC_DATATYPE_COLOR |
+-		    RADEON_ROP3_S |
+-		    RADEON_DP_SRC_SOURCE_MEMORY |
+-		    RADEON_GMC_CLR_CMP_CNTL_DIS | RADEON_GMC_WR_MSK_DIS);
+-
+-	cmd[3].u = ((src_pitch / 64) << 22) | (src_offset >> 10);
+-	cmd[4].u = ((dst_pitch / 64) << 22) | (dst_offset >> 10);
+-	cmd[5].u = (srcx << 16) | srcy;
+-	cmd[6].u = (dstx << 16) | dsty;	/* dst */
+-	cmd[7].u = (w << 16) | h;
+-}
+-
+-void r300EmitWait(r300ContextPtr rmesa, GLuint flags)
+-{
+-	drm_r300_cmd_header_t *cmd;
+-
+-	assert(!(flags & ~(R300_WAIT_2D | R300_WAIT_3D)));
+ 
+-	cmd = (drm_r300_cmd_header_t *) r300AllocCmdBuf(rmesa, 1, __FUNCTION__);
+-	cmd[0].u = 0;
+-	cmd[0].wait.cmd_type = R300_CMD_WAIT;
+-	cmd[0].wait.flags = flags;
+ }
+diff --git a/src/mesa/drivers/dri/r300/r300_cmdbuf.h b/src/mesa/drivers/dri/r300/r300_cmdbuf.h
+index a8eaa58..b7798eb 100644
+--- a/src/mesa/drivers/dri/r300/r300_cmdbuf.h
++++ b/src/mesa/drivers/dri/r300/r300_cmdbuf.h
+@@ -38,79 +38,15 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ #include "r300_context.h"
+ 
+-extern int r300FlushCmdBufLocked(r300ContextPtr r300, const char *caller);
+-extern int r300FlushCmdBuf(r300ContextPtr r300, const char *caller);
+-
+-extern void r300EmitState(r300ContextPtr r300);
+-
+ extern void r300InitCmdBuf(r300ContextPtr r300);
+ extern void r300DestroyCmdBuf(r300ContextPtr r300);
+ 
+-/**
+- * Make sure that enough space is available in the command buffer
+- * by flushing if necessary.
+- *
+- * \param dwords The number of dwords we need to be free on the command buffer
+- */
+-static INLINE void r300EnsureCmdBufSpace(r300ContextPtr r300,
+-					     int dwords, const char *caller)
+-{
+-	assert(dwords < r300->cmdbuf.size);
+-
+-	if (r300->cmdbuf.count_used + dwords > r300->cmdbuf.size)
+-		r300FlushCmdBuf(r300, caller);
+-}
+-
+-/**
+- * Allocate the given number of dwords in the command buffer and return
+- * a pointer to the allocated area.
+- * When necessary, these functions cause a flush. r300AllocCmdBuf() also
+- * causes state reemission after a flush. This is necessary to ensure
+- * correct hardware state after an unlock.
+- */
+-static INLINE uint32_t *r300RawAllocCmdBuf(r300ContextPtr r300,
+-					       int dwords, const char *caller)
+-{
+-	uint32_t *ptr;
+-
+-	r300EnsureCmdBufSpace(r300, dwords, caller);
+-
+-	ptr = &r300->cmdbuf.cmd_buf[r300->cmdbuf.count_used];
+-	r300->cmdbuf.count_used += dwords;
+-	return ptr;
+-}
+-
+-static INLINE uint32_t *r300AllocCmdBuf(r300ContextPtr r300,
+-					    int dwords, const char *caller)
+-{
+-	uint32_t *ptr;
+-
+-	r300EnsureCmdBufSpace(r300, dwords, caller);
+-
+-	if (!r300->cmdbuf.count_used) {
+-		if (RADEON_DEBUG & DEBUG_IOCTL)
+-			fprintf(stderr,
+-				"Reemit state after flush (from %s)\n", caller);
+-		r300EmitState(r300);
+-	}
+-
+-	ptr = &r300->cmdbuf.cmd_buf[r300->cmdbuf.count_used];
+-	r300->cmdbuf.count_used += dwords;
+-	return ptr;
+-}
+ 
+-extern void r300EmitBlit(r300ContextPtr rmesa,
+-			 GLuint color_fmt,
+-			 GLuint src_pitch,
+-			 GLuint src_offset,
+-			 GLuint dst_pitch,
+-			 GLuint dst_offset,
+-			 GLint srcx, GLint srcy,
+-			 GLint dstx, GLint dsty, GLuint w, GLuint h);
++void emit_vpu(GLcontext *ctx, struct radeon_state_atom * atom);
++int check_vpu(GLcontext *ctx, struct radeon_state_atom *atom);
+ 
+-extern void r300EmitWait(r300ContextPtr rmesa, GLuint flags);
+-extern void r300EmitLOAD_VBPNTR(r300ContextPtr rmesa, int start);
+-extern void r300EmitVertexShader(r300ContextPtr rmesa);
+-extern void r300EmitPixelShader(r300ContextPtr rmesa);
++void emit_r500fp(GLcontext *ctx, struct radeon_state_atom * atom);
++int check_r500fp(GLcontext *ctx, struct radeon_state_atom *atom);
++int check_r500fp_const(GLcontext *ctx, struct radeon_state_atom *atom);
+ 
+ #endif				/* __R300_CMDBUF_H__ */
+diff --git a/src/mesa/drivers/dri/r300/r300_context.c b/src/mesa/drivers/dri/r300/r300_context.c
+index 3743627..5d497ef 100644
+--- a/src/mesa/drivers/dri/r300/r300_context.c
++++ b/src/mesa/drivers/dri/r300/r300_context.c
+@@ -44,6 +44,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "main/extensions.h"
+ #include "main/state.h"
+ #include "main/bufferobj.h"
++#include "main/texobj.h"
+ 
+ #include "swrast/swrast.h"
+ #include "swrast_setup/swrast_setup.h"
+@@ -55,19 +56,17 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ #include "drivers/common/driverfuncs.h"
+ 
+-#include "radeon_ioctl.h"
+-#include "radeon_span.h"
+ #include "r300_context.h"
++#include "radeon_context.h"
++#include "radeon_span.h"
+ #include "r300_cmdbuf.h"
+ #include "r300_state.h"
+ #include "r300_ioctl.h"
+ #include "r300_tex.h"
+ #include "r300_emit.h"
+ #include "r300_swtcl.h"
++#include "radeon_bocs_wrapper.h"
+ 
+-#ifdef USER_BUFFERS
+-#include "r300_mem.h"
+-#endif
+ 
+ #include "vblank.h"
+ #include "utils.h"
+@@ -77,19 +76,17 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ int future_hw_tcl_on = 1;
+ int hw_tcl_on = 1;
+ 
+-#define need_GL_EXT_stencil_two_side
+-#define need_GL_ARB_multisample
++#define need_GL_VERSION_2_0
+ #define need_GL_ARB_point_parameters
+-#define need_GL_ARB_texture_compression
+-#define need_GL_ARB_vertex_buffer_object
+ #define need_GL_ARB_vertex_program
+-#define need_GL_EXT_blend_minmax
+-//#define need_GL_EXT_fog_coord
+-#define need_GL_EXT_multi_draw_arrays
+-#define need_GL_EXT_secondary_color
+ #define need_GL_EXT_blend_equation_separate
+ #define need_GL_EXT_blend_func_separate
++#define need_GL_EXT_blend_minmax
++//#define need_GL_EXT_fog_coord
+ #define need_GL_EXT_gpu_program_parameters
++#define need_GL_EXT_secondary_color
++#define need_GL_EXT_stencil_two_side
++#define need_GL_ATI_separate_stencil
+ #define need_GL_NV_vertex_program
+ #include "extension_helper.h"
+ 
+@@ -97,27 +94,23 @@ const struct dri_extension card_extensions[] = {
+   /* *INDENT-OFF* */
+   {"GL_ARB_depth_texture",		NULL},
+   {"GL_ARB_fragment_program",		NULL},
+-  {"GL_ARB_multisample",		GL_ARB_multisample_functions},
+   {"GL_ARB_multitexture",		NULL},
+   {"GL_ARB_point_parameters",		GL_ARB_point_parameters_functions},
+   {"GL_ARB_shadow",			NULL},
+   {"GL_ARB_shadow_ambient",		NULL},
+   {"GL_ARB_texture_border_clamp",	NULL},
+-  {"GL_ARB_texture_compression",	GL_ARB_texture_compression_functions},
+   {"GL_ARB_texture_cube_map",		NULL},
+   {"GL_ARB_texture_env_add",		NULL},
+   {"GL_ARB_texture_env_combine",	NULL},
+   {"GL_ARB_texture_env_crossbar",	NULL},
+   {"GL_ARB_texture_env_dot3",		NULL},
+   {"GL_ARB_texture_mirrored_repeat",	NULL},
+-  {"GL_ARB_vertex_buffer_object",	GL_ARB_vertex_buffer_object_functions},
+   {"GL_ARB_vertex_program",		GL_ARB_vertex_program_functions},
+   {"GL_EXT_blend_equation_separate",	GL_EXT_blend_equation_separate_functions},
+   {"GL_EXT_blend_func_separate",	GL_EXT_blend_func_separate_functions},
+   {"GL_EXT_blend_minmax",		GL_EXT_blend_minmax_functions},
+   {"GL_EXT_blend_subtract",		NULL},
+ //  {"GL_EXT_fog_coord",			GL_EXT_fog_coord_functions },
+-  {"GL_EXT_multi_draw_arrays",		GL_EXT_multi_draw_arrays_functions},
+   {"GL_EXT_gpu_program_parameters",     GL_EXT_gpu_program_parameters_functions},
+   {"GL_EXT_secondary_color", 		GL_EXT_secondary_color_functions},
+   {"GL_EXT_shadow_funcs",		NULL},
+@@ -130,6 +123,7 @@ const struct dri_extension card_extensions[] = {
+   {"GL_EXT_texture_lod_bias",		NULL},
+   {"GL_EXT_texture_mirror_clamp",	NULL},
+   {"GL_EXT_texture_rectangle",		NULL},
++  {"GL_ATI_separate_stencil",		GL_ATI_separate_stencil_functions},
+   {"GL_ATI_texture_env_combine3",	NULL},
+   {"GL_ATI_texture_mirror_once",	NULL},
+   {"GL_MESA_pack_invert",		NULL},
+@@ -142,6 +136,16 @@ const struct dri_extension card_extensions[] = {
+   /* *INDENT-ON* */
+ };
+ 
++
++/**
++ * The GL 2.0 functions are needed to make display lists work with
++ * functions added by GL_ATI_separate_stencil.
++ */
++const struct dri_extension gl_20_extension[] = {
++  {"GL_VERSION_2_0",			GL_VERSION_2_0_functions },
++};
++
++
+ extern struct tnl_pipeline_stage _r300_render_stage;
+ extern const struct tnl_pipeline_stage _r300_tcl_stage;
+ 
+@@ -178,6 +182,82 @@ static const struct tnl_pipeline_stage *r300_pipeline[] = {
+ 	0,
+ };
+ 
++static void r300RunPipeline(GLcontext * ctx)
 +{
-+    struct bo_manager_legacy *boml;
++    _mesa_lock_context_textures(ctx);
 +
-+    if (bo_legacy == NULL) {
-+        return;
-+    }
-+    boml = (struct bo_manager_legacy *)bo_legacy->base.bom;
-+    bo_legacy->prev->next = bo_legacy->next;
-+    if (bo_legacy->next) {
-+        bo_legacy->next->prev = bo_legacy->prev;
-+    }
-+    if (!bo_legacy->static_bo) {
-+        legacy_free_handle(boml, bo_legacy->base.handle);
-+        if (bo_legacy->base.domains & RADEON_GEM_DOMAIN_GTT) {
-+            /* dma buffers */
-+            bo_dma_free(&bo_legacy->base);
-+        } else {
-+  	    driDestroyTextureObject(&bo_legacy->tobj->base);
-+	    bo_legacy->tobj = NULL;
-+            /* free backing store */
-+            free(bo_legacy->ptr);
-+        }
-+    }
-+    memset(bo_legacy, 0 , sizeof(struct bo_legacy));
-+    free(bo_legacy);
++    if (ctx->NewState)
++        _mesa_update_state_locked(ctx);
++    
++    _tnl_run_pipeline(ctx);
++    _mesa_unlock_context_textures(ctx);
 +}
 +
-+static struct radeon_bo *bo_open(struct radeon_bo_manager *bom,
-+                                 uint32_t handle,
-+                                 uint32_t size,
-+                                 uint32_t alignment,
-+                                 uint32_t domains,
-+                                 uint32_t flags)
++static void r300_get_lock(radeonContextPtr rmesa)
 +{
-+    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bom;
-+    struct bo_legacy *bo_legacy;
-+    int r;
-+
-+    if (handle) {
-+        bo_legacy = boml->bos.next;
-+        while (bo_legacy) {
-+            if (bo_legacy->base.handle == handle) {
-+                radeon_bo_ref(&(bo_legacy->base));
-+                return (struct radeon_bo*)bo_legacy;
-+            }
-+            bo_legacy = bo_legacy->next;
-+        }
-+        return NULL;
-+    }
++	drm_radeon_sarea_t *sarea = rmesa->sarea;
 +
-+    bo_legacy = bo_allocate(boml, size, alignment, domains, flags);
-+    bo_legacy->static_bo = 0;
-+    r = legacy_new_handle(boml, &bo_legacy->base.handle);
-+    if (r) {
-+        bo_free(bo_legacy);
-+        return NULL;
-+    }
-+    if (bo_legacy->base.domains & RADEON_GEM_DOMAIN_GTT) {
-+    retry:
-+        legacy_track_pending(boml, 0);
-+        /* dma buffers */
++	if (sarea->ctx_owner != rmesa->dri.hwContext) {
++		sarea->ctx_owner = rmesa->dri.hwContext;
++		if (!rmesa->radeonScreen->kernel_mm)
++			radeon_bo_legacy_texture_age(rmesa->radeonScreen->bom);
++	}
++}		  
 +
-+        r = bo_dma_alloc(&(bo_legacy->base));
-+        if (r) {
-+	  if (legacy_wait_any_pending(boml) == -1) {
-+            bo_free(bo_legacy);
-+	    return NULL;
-+	  }
-+	  goto retry;
-+	  return NULL;
-+        }
-+    } else {
-+        bo_legacy->ptr = malloc(bo_legacy->base.size);
-+        if (bo_legacy->ptr == NULL) {
-+            bo_free(bo_legacy);
-+            return NULL;
-+        }
-+    }
-+    radeon_bo_ref(&(bo_legacy->base));
-+    return (struct radeon_bo*)bo_legacy;
++static void r300_vtbl_emit_cs_header(struct radeon_cs *cs, radeonContextPtr rmesa)
++{
++    /* please flush pipe do all pending work */
++    radeon_cs_write_dword(cs, cmdpacket0(rmesa->radeonScreen,
++                                  R300_SC_SCREENDOOR, 1));
++    radeon_cs_write_dword(cs, 0x0);
++    radeon_cs_write_dword(cs, cmdpacket0(rmesa->radeonScreen,
++                                  R300_SC_SCREENDOOR, 1));
++    radeon_cs_write_dword(cs, 0x00FFFFFF);
++    radeon_cs_write_dword(cs, cmdpacket0(rmesa->radeonScreen,
++                                  R300_SC_HYPERZ, 1));
++    radeon_cs_write_dword(cs, 0x0);
++    radeon_cs_write_dword(cs, cmdpacket0(rmesa->radeonScreen,
++                                  R300_US_CONFIG, 1));
++    radeon_cs_write_dword(cs, 0x0);
++    radeon_cs_write_dword(cs, cmdpacket0(rmesa->radeonScreen,
++                                  R300_ZB_CNTL, 1));
++    radeon_cs_write_dword(cs, 0x0);
++    radeon_cs_write_dword(cs, cmdwait(rmesa->radeonScreen, R300_WAIT_3D));
++    radeon_cs_write_dword(cs, cmdpacket0(rmesa->radeonScreen,
++                                  R300_RB3D_DSTCACHE_CTLSTAT, 1));
++    radeon_cs_write_dword(cs, R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D);
++    radeon_cs_write_dword(cs, cmdpacket0(rmesa->radeonScreen,
++                                  R300_ZB_ZCACHE_CTLSTAT, 1));
++    radeon_cs_write_dword(cs, R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_FLUSH_AND_FREE);
++    radeon_cs_write_dword(cs, cmdwait(rmesa->radeonScreen,
++                               R300_WAIT_3D | R300_WAIT_3D_CLEAN));
 +}
 +
-+static void bo_ref(struct radeon_bo *bo)
++static void r300_vtbl_pre_emit_atoms(radeonContextPtr radeon)
 +{
++   r300ContextPtr r300 = (r300ContextPtr)radeon;
++   BATCH_LOCALS(radeon);
++
++   r300->vap_flush_needed = GL_TRUE;
++
++   cp_wait(radeon, R300_WAIT_3D | R300_WAIT_3D_CLEAN);
++   BEGIN_BATCH_NO_AUTOSTATE(2);
++   OUT_BATCH_REGVAL(R300_TX_INVALTAGS, R300_TX_FLUSH);
++   END_BATCH();
++   end_3d(radeon);
 +}
 +
-+static struct radeon_bo *bo_unref(struct radeon_bo *bo)
++static void r300_init_vtbl(radeonContextPtr radeon)
 +{
-+    struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
++   radeon->vtbl.get_lock = r300_get_lock;
++   radeon->vtbl.update_viewport_offset = r300UpdateViewportOffset;
++   radeon->vtbl.update_draw_buffer = r300UpdateDrawBuffer;
++   radeon->vtbl.emit_cs_header = r300_vtbl_emit_cs_header;
++   radeon->vtbl.swtcl_flush = r300_swtcl_flush;
++   radeon->vtbl.pre_emit_atoms = r300_vtbl_pre_emit_atoms;
++}
 +
-+    if (bo->cref <= 0) {
-+        bo_legacy->prev->next = bo_legacy->next;
-+        if (bo_legacy->next) {
-+            bo_legacy->next->prev = bo_legacy->prev;
-+        }
-+        if (!bo_legacy->is_pending) {
-+            bo_free(bo_legacy);
++
+ /* Create the device specific rendering context.
+  */
+ GLboolean r300CreateContext(const __GLcontextModes * glVisual,
+@@ -189,7 +269,7 @@ GLboolean r300CreateContext(const __GLcontextModes * glVisual,
+ 	struct dd_function_table functions;
+ 	r300ContextPtr r300;
+ 	GLcontext *ctx;
+-	int tcl_mode, i;
++	int tcl_mode;
+ 
+ 	assert(glVisual);
+ 	assert(driContextPriv);
+@@ -203,13 +283,14 @@ GLboolean r300CreateContext(const __GLcontextModes * glVisual,
+ 	if (!(screen->chip_flags & RADEON_CHIPSET_TCL))
+ 		hw_tcl_on = future_hw_tcl_on = 0;
+ 
++	r300_init_vtbl(&r300->radeon);
+ 	/* Parse configuration files.
+ 	 * Do this here so that initialMaxAnisotropy is set before we create
+ 	 * the default textures.
+ 	 */
+ 	driParseConfigFiles(&r300->radeon.optionCache, &screen->optionCache,
+ 			    screen->driScreen->myNum, "r300");
+-	r300->initialMaxAnisotropy = driQueryOptionf(&r300->radeon.optionCache,
++	r300->radeon.initialMaxAnisotropy = driQueryOptionf(&r300->radeon.optionCache,
+ 						     "def_max_anisotropy");
+ 
+ 	/* Init default driver functions then plug in our R300-specific functions
+@@ -221,10 +302,6 @@ GLboolean r300CreateContext(const __GLcontextModes * glVisual,
+ 	r300InitTextureFuncs(&functions);
+ 	r300InitShaderFuncs(&functions);
+ 
+-#ifdef USER_BUFFERS
+-	r300_mem_init(r300);
+-#endif
+-
+ 	if (!radeonInitContext(&r300->radeon, &functions,
+ 			       glVisual, driContextPriv,
+ 			       sharedContextPrivate)) {
+@@ -233,37 +310,10 @@ GLboolean r300CreateContext(const __GLcontextModes * glVisual,
+ 	}
+ 
+ 	/* Init r300 context data */
+-	r300->dma.buf0_address =
+-	    r300->radeon.radeonScreen->buffers->list[0].address;
+-
+-	(void)memset(r300->texture_heaps, 0, sizeof(r300->texture_heaps));
+-	make_empty_list(&r300->swapped);
+-
+-	r300->nr_heaps = 1 /* screen->numTexHeaps */ ;
+-	assert(r300->nr_heaps < RADEON_NR_TEX_HEAPS);
+-	for (i = 0; i < r300->nr_heaps; i++) {
+-		/* *INDENT-OFF* */
+-		r300->texture_heaps[i] = driCreateTextureHeap(i, r300,
+-							       screen->
+-							       texSize[i], 12,
+-							       RADEON_NR_TEX_REGIONS,
+-							       (drmTextureRegionPtr)
+-							       r300->radeon.sarea->
+-							       tex_list[i],
+-							       &r300->radeon.sarea->
+-							       tex_age[i],
+-							       &r300->swapped,
+-							       sizeof
+-							       (r300TexObj),
+-							       (destroy_texture_object_t
+-								*)
+-							       r300DestroyTexObj);
+-		/* *INDENT-ON* */
+-	}
+-	r300->texture_depth = driQueryOptioni(&r300->radeon.optionCache,
++	r300->radeon.texture_depth = driQueryOptioni(&r300->radeon.optionCache,
+ 					      "texture_depth");
+-	if (r300->texture_depth == DRI_CONF_TEXTURE_DEPTH_FB)
+-		r300->texture_depth = (screen->cpp == 4) ?
++	if (r300->radeon.texture_depth == DRI_CONF_TEXTURE_DEPTH_FB)
++		r300->radeon.texture_depth = (screen->cpp == 4) ?
+ 		    DRI_CONF_TEXTURE_DEPTH_32 : DRI_CONF_TEXTURE_DEPTH_16;
+ 
+ 	/* Set the maximum texture size small enough that we can guarentee that
+@@ -298,13 +348,13 @@ GLboolean r300CreateContext(const __GLcontextModes * glVisual,
+ 	ctx->Const.MaxLineWidth = R300_LINESIZE_MAX;
+ 	ctx->Const.MaxLineWidthAA = R300_LINESIZE_MAX;
+ 
+-#ifdef USER_BUFFERS
+ 	/* Needs further modifications */
+ #if 0
+ 	ctx->Const.MaxArrayLockSize =
+ 	    ( /*512 */ RADEON_BUFFER_SIZE * 16 * 1024) / (4 * 4);
+ #endif
+-#endif
++
++	ctx->Const.MaxDrawBuffers = 1;
+ 
+ 	/* Initialize the software rasterizer and helper modules.
+ 	 */
+@@ -377,13 +427,13 @@ GLboolean r300CreateContext(const __GLcontextModes * glVisual,
+ 	    driQueryOptionb(&r300->radeon.optionCache,
+ 			    "disable_lowimpact_fallback");
+ 
+-	radeonInitSpanFuncs(ctx);
++   	radeonInitSpanFuncs( ctx );
+ 	r300InitCmdBuf(r300);
+ 	r300InitState(r300);
+ 	if (!(screen->chip_flags & RADEON_CHIPSET_TCL))
+ 	        r300InitSwtcl(ctx);
+ 
+-	TNL_CONTEXT(ctx)->Driver.RunPipeline = _tnl_run_pipeline;
++	TNL_CONTEXT(ctx)->Driver.RunPipeline = r300RunPipeline;
+ 
+ 	tcl_mode = driQueryOptioni(&r300->radeon.optionCache, "tcl_mode");
+ 	if (driQueryOptionb(&r300->radeon.optionCache, "no_rast")) {
+@@ -406,72 +456,6 @@ GLboolean r300CreateContext(const __GLcontextModes * glVisual,
+ 	return GL_TRUE;
+ }
+ 
+-static void r300FreeGartAllocations(r300ContextPtr r300)
+-{
+-	int i, ret, tries = 0, done_age, in_use = 0;
+-	drm_radeon_mem_free_t memfree;
+-
+-	memfree.region = RADEON_MEM_REGION_GART;
+-
+-#ifdef USER_BUFFERS
+-	for (i = r300->rmm->u_last; i > 0; i--) {
+-		if (r300->rmm->u_list[i].ptr == NULL) {
+-			continue;
+-		}
+-
+-		/* check whether this buffer is still in use */
+-		if (r300->rmm->u_list[i].pending) {
+-			in_use++;
+-		}
+-	}
+-	/* Cannot flush/lock if no context exists. */
+-	if (in_use)
+-		r300FlushCmdBuf(r300, __FUNCTION__);
+-
+-	done_age = radeonGetAge((radeonContextPtr) r300);
+-
+-	for (i = r300->rmm->u_last; i > 0; i--) {
+-		if (r300->rmm->u_list[i].ptr == NULL) {
+-			continue;
+-		}
+-
+-		/* check whether this buffer is still in use */
+-		if (!r300->rmm->u_list[i].pending) {
+-			continue;
+-		}
+-
+-		assert(r300->rmm->u_list[i].h_pending == 0);
+-
+-		tries = 0;
+-		while (r300->rmm->u_list[i].age > done_age && tries++ < 1000) {
+-			usleep(10);
+-			done_age = radeonGetAge((radeonContextPtr) r300);
+-		}
+-		if (tries >= 1000) {
+-			WARN_ONCE("Failed to idle region!");
+-		}
+-
+-		memfree.region_offset = (char *)r300->rmm->u_list[i].ptr -
+-		    (char *)r300->radeon.radeonScreen->gartTextures.map;
+-
+-		ret = drmCommandWrite(r300->radeon.radeonScreen->driScreen->fd,
+-				      DRM_RADEON_FREE, &memfree,
+-				      sizeof(memfree));
+-		if (ret) {
+-			fprintf(stderr, "Failed to free at %p\nret = %s\n",
+-				r300->rmm->u_list[i].ptr, strerror(-ret));
+-		} else {
+-			if (i == r300->rmm->u_last)
+-				r300->rmm->u_last--;
+-
+-			r300->rmm->u_list[i].pending = 0;
+-			r300->rmm->u_list[i].ptr = NULL;
+-		}
+-	}
+-	r300->rmm->u_head = i;
+-#endif				/* USER_BUFFERS */
+-}
+-
+ /* Destroy the device specific context.
+  */
+ void r300DestroyContext(__DRIcontextPrivate * driContextPriv)
+@@ -495,55 +479,27 @@ void r300DestroyContext(__DRIcontextPrivate * driContextPriv)
+ 	assert(r300);		/* should never be null */
+ 
+ 	if (r300) {
+-		GLboolean release_texture_heaps;
+-
+-		release_texture_heaps =
+-		    (r300->radeon.glCtx->Shared->RefCount == 1);
+ 		_swsetup_DestroyContext(r300->radeon.glCtx);
+ 		_tnl_DestroyContext(r300->radeon.glCtx);
+ 		_vbo_DestroyContext(r300->radeon.glCtx);
+ 		_swrast_DestroyContext(r300->radeon.glCtx);
+ 
+-		if (r300->dma.current.buf) {
+-			r300ReleaseDmaRegion(r300, &r300->dma.current,
+-					     __FUNCTION__);
+-#ifndef USER_BUFFERS
+-			r300FlushCmdBuf(r300, __FUNCTION__);
+-#endif
+-		}
+-		r300FreeGartAllocations(r300);
+-		r300DestroyCmdBuf(r300);
++		rcommonFlushCmdBuf(&r300->radeon, __FUNCTION__);
+ 
+ 		if (radeon->state.scissor.pClipRects) {
+ 			FREE(radeon->state.scissor.pClipRects);
+ 			radeon->state.scissor.pClipRects = NULL;
+ 		}
+ 
+-		if (release_texture_heaps) {
+-			/* This share group is about to go away, free our private
+-			 * texture object data.
+-			 */
+-			int i;
+-
+-			for (i = 0; i < r300->nr_heaps; i++) {
+-				driDestroyTextureHeap(r300->texture_heaps[i]);
+-				r300->texture_heaps[i] = NULL;
+-			}
+-
+-			assert(is_empty_list(&r300->swapped));
+-		}
++		r300DestroyCmdBuf(r300);
+ 
+ 		radeonCleanupContext(&r300->radeon);
+ 
+-#ifdef USER_BUFFERS
++
+ 		/* the memory manager might be accessed when Mesa frees the shared
+ 		 * state, so don't destroy it earlier
+ 		 */
+-		r300_mem_destroy(r300);
+-#endif
+ 
+-		/* free the option cache */
+-		driDestroyOptionCache(&r300->radeon.optionCache);
+ 
+ 		FREE(r300);
+ 	}
+diff --git a/src/mesa/drivers/dri/r300/r300_context.h b/src/mesa/drivers/dri/r300/r300_context.h
+index c15e9fa..37718f5 100644
+--- a/src/mesa/drivers/dri/r300/r300_context.h
++++ b/src/mesa/drivers/dri/r300/r300_context.h
+@@ -42,21 +42,20 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "radeon_drm.h"
+ #include "dri_util.h"
+ #include "texmem.h"
++#include "radeon_common.h"
+ 
+ #include "main/macros.h"
+ #include "main/mtypes.h"
+ #include "main/colormac.h"
+ 
+-#define USER_BUFFERS
+-
+ struct r300_context;
+ typedef struct r300_context r300ContextRec;
+ typedef struct r300_context *r300ContextPtr;
+ 
+-#include "radeon_lock.h"
++
+ #include "main/mm.h"
+ 
+-/* From http://gcc.gnu.org/onlinedocs/gcc-3.2.3/gcc/Variadic-Macros.html .
++/* From http://gcc. gnu.org/onlinedocs/gcc-3.2.3/gcc/Variadic-Macros.html .
+    I suppose we could inline this and use macro to fetch out __LINE__ and stuff in case we run into trouble
+    with other compilers ... GLUE!
+ */
+@@ -75,174 +74,19 @@ typedef struct r300_context *r300ContextPtr;
+ #include "r300_vertprog.h"
+ #include "r500_fragprog.h"
+ 
+-/**
+- * This function takes a float and packs it into a uint32_t
+- */
+-static INLINE uint32_t r300PackFloat32(float fl)
+-{
+-	union {
+-		float fl;
+-		uint32_t u;
+-	} u;
+-
+-	u.fl = fl;
+-	return u.u;
+-}
+-
+-/* This is probably wrong for some values, I need to test this
+- * some more.  Range checking would be a good idea also..
+- *
+- * But it works for most things.  I'll fix it later if someone
+- * else with a better clue doesn't
+- */
+-static INLINE uint32_t r300PackFloat24(float f)
+-{
+-	float mantissa;
+-	int exponent;
+-	uint32_t float24 = 0;
+-
+-	if (f == 0.0)
+-		return 0;
+ 
+-	mantissa = frexpf(f, &exponent);
+-
+-	/* Handle -ve */
+-	if (mantissa < 0) {
+-		float24 |= (1 << 23);
+-		mantissa = mantissa * -1.0;
+-	}
+-	/* Handle exponent, bias of 63 */
+-	exponent += 62;
+-	float24 |= (exponent << 16);
+-	/* Kill 7 LSB of mantissa */
+-	float24 |= (r300PackFloat32(mantissa) & 0x7FFFFF) >> 7;
+-
+-	return float24;
+-}
+ 
+ /************ DMA BUFFERS **************/
+ 
+-/* Need refcounting on dma buffers:
+- */
+-struct r300_dma_buffer {
+-	int refcount;		/**< the number of retained regions in buf */
+-	drmBufPtr buf;
+-	int id;
+-};
+-#undef GET_START
+-#ifdef USER_BUFFERS
+-#define GET_START(rvb) (r300GartOffsetFromVirtual(rmesa, (rvb)->address+(rvb)->start))
+-#else
+-#define GET_START(rvb) (rmesa->radeon.radeonScreen->gart_buffer_offset +		\
+-			(rvb)->address - rmesa->dma.buf0_address +	\
+-			(rvb)->start)
+-#endif
+-/* A retained region, eg vertices for indexed vertices.
+- */
+-struct r300_dma_region {
+-	struct r300_dma_buffer *buf;
+-	char *address;		/* == buf->address */
+-	int start, end, ptr;	/* offsets from start of buf */
+-
+-	int aos_offset;		/* address in GART memory */
+-	int aos_stride;		/* distance between elements, in dwords */
+-	int aos_size;		/* number of components (1-4) */
+-};
+-
+-struct r300_dma {
+-	/* Active dma region.  Allocations for vertices and retained
+-	 * regions come from here.  Also used for emitting random vertices,
+-	 * these may be flushed by calling flush_current();
+-	 */
+-	struct r300_dma_region current;
+-
+-	void (*flush) (r300ContextPtr);
+-
+-	char *buf0_address;	/* start of buf[0], for index calcs */
+-
+-	/* Number of "in-flight" DMA buffers, i.e. the number of buffers
+-	 * for which a DISCARD command is currently queued in the command buffer.
+-	 */
+-	GLuint nr_released_bufs;
+-};
+-
+-       /* Texture related */
+-
+-typedef struct r300_tex_obj r300TexObj, *r300TexObjPtr;
+-
+-/* Texture object in locally shared texture space.
+- */
+-struct r300_tex_obj {
+-	driTextureObject base;
+-
+-	GLuint bufAddr;		/* Offset to start of locally
+-				   shared texture block */
+-
+-	drm_radeon_tex_image_t image[6][RADEON_MAX_TEXTURE_LEVELS];
+-	/* Six, for the cube faces */
+-
+-	GLboolean image_override;	/* Image overridden by GLX_EXT_tfp */
+-
+-	GLuint pitch;		/* this isn't sent to hardware just used in calculations */
+-	/* hardware register values */
+-	/* Note that R200 has 8 registers per texture and R300 only 7 */
+-	GLuint filter;
+-	GLuint filter_1;
+-	GLuint pitch_reg;
+-	GLuint size;		/* npot only */
+-	GLuint format;
+-	GLuint offset;		/* Image location in the card's address space.
+-				   All cube faces follow. */
+-	GLuint unknown4;
+-	GLuint unknown5;
+-	/* end hardware registers */
+-
+-	/* registers computed by r200 code - keep them here to
+-	   compare against what is actually written.
+-
+-	   to be removed later.. */
+-	GLuint pp_border_color;
+-	GLuint pp_cubic_faces;	/* cube face 1,2,3,4 log2 sizes */
+-	GLuint format_x;
+-
+-	GLboolean border_fallback;
+-
+-	GLuint tile_bits;	/* hw texture tile bits used on this texture */
+-};
+-
+-struct r300_texture_env_state {
+-	r300TexObjPtr texobj;
+-	GLenum format;
+-	GLenum envMode;
+-};
+-
+ /* The blit width for texture uploads
+  */
+ #define R300_BLIT_WIDTH_BYTES 1024
+ #define R300_MAX_TEXTURE_UNITS 8
+ 
+ struct r300_texture_state {
+-	struct r300_texture_env_state unit[R300_MAX_TEXTURE_UNITS];
+ 	int tc_count;		/* number of incoming texture coordinates from VAP */
+ };
+ 
+-/**
+- * A block of hardware state.
+- *
+- * When check returns non-zero, the returned number of dwords must be
+- * copied verbatim into the command buffer in order to update a state atom
+- * when it is dirty.
+- */
+-struct r300_state_atom {
+-	struct r300_state_atom *next, *prev;
+-	const char *name;	/* for debug */
+-	int cmd_size;		/* maximum size in dwords */
+-	GLuint idx;		/* index in an array (e.g. textures) */
+-	uint32_t *cmd;
+-	GLboolean dirty;
+-
+-	int (*check) (r300ContextPtr, struct r300_state_atom * atom);
+-};
+ 
+ #define R300_VPT_CMD_0		0
+ #define R300_VPT_XSCALE		1
+@@ -459,124 +303,98 @@ struct r300_state_atom {
+  * Cache for hardware register state.
+  */
+ struct r300_hw_state {
+-	struct r300_state_atom atomlist;
+-
+-	GLboolean is_dirty;
+-	GLboolean all_dirty;
+-	int max_state_size;	/* in dwords */
+-
+-	struct r300_state_atom vpt;	/* viewport (1D98) */
+-	struct r300_state_atom vap_cntl;
+-        struct r300_state_atom vap_index_offset; /* 0x208c r5xx only */
+-	struct r300_state_atom vof;	/* VAP output format register 0x2090 */
+-	struct r300_state_atom vte;	/* (20B0) */
+-	struct r300_state_atom vap_vf_max_vtx_indx;	/* Maximum Vertex Indx Clamp (2134) */
+-	struct r300_state_atom vap_cntl_status;
+-	struct r300_state_atom vir[2];	/* vap input route (2150/21E0) */
+-	struct r300_state_atom vic;	/* vap input control (2180) */
+-	struct r300_state_atom vap_psc_sgn_norm_cntl; /* Programmable Stream Control Signed Normalize Control (21DC) */
+-	struct r300_state_atom vap_clip_cntl;
+-	struct r300_state_atom vap_clip;
+-	struct r300_state_atom vap_pvs_vtx_timeout_reg;	/* Vertex timeout register (2288) */
+-	struct r300_state_atom pvs;	/* pvs_cntl (22D0) */
+-	struct r300_state_atom gb_enable;	/* (4008) */
+-	struct r300_state_atom gb_misc;	/* Multisampling position shifts ? (4010) */
+-	struct r300_state_atom ga_point_s0;	/* S Texture Coordinate of Vertex 0 for Point texture stuffing (LLC) (4200) */
+-	struct r300_state_atom ga_triangle_stipple;	/* (4214) */
+-	struct r300_state_atom ps;	/* pointsize (421C) */
+-	struct r300_state_atom ga_point_minmax;	/* (4230) */
+-	struct r300_state_atom lcntl;	/* line control */
+-	struct r300_state_atom ga_line_stipple;	/* (4260) */
+-	struct r300_state_atom shade;
+-	struct r300_state_atom polygon_mode;
+-	struct r300_state_atom fogp;	/* fog parameters (4294) */
+-	struct r300_state_atom ga_soft_reset;	/* (429C) */
+-	struct r300_state_atom zbias_cntl;
+-	struct r300_state_atom zbs;	/* zbias (42A4) */
+-	struct r300_state_atom occlusion_cntl;
+-	struct r300_state_atom cul;	/* cull cntl (42B8) */
+-	struct r300_state_atom su_depth_scale;	/* (42C0) */
+-	struct r300_state_atom rc;	/* rs control (4300) */
+-	struct r300_state_atom ri;	/* rs interpolators (4310) */
+-	struct r300_state_atom rr;	/* rs route (4330) */
+-	struct r300_state_atom sc_hyperz;	/* (43A4) */
+-	struct r300_state_atom sc_screendoor;	/* (43E8) */
+-	struct r300_state_atom fp;	/* fragment program cntl + nodes (4600) */
+-	struct r300_state_atom fpt;	/* texi - (4620) */
+-	struct r300_state_atom us_out_fmt;	/* (46A4) */
+-	struct r300_state_atom r500fp;	/* r500 fp instructions */
+-	struct r300_state_atom r500fp_const;	/* r500 fp constants */
+-	struct r300_state_atom fpi[4];	/* fp instructions (46C0/47C0/48C0/49C0) */
+-	struct r300_state_atom fogs;	/* fog state (4BC0) */
+-	struct r300_state_atom fogc;	/* fog color (4BC8) */
+-	struct r300_state_atom at;	/* alpha test (4BD4) */
+-	struct r300_state_atom fg_depth_src;	/* (4BD8) */
+-	struct r300_state_atom fpp;	/* 0x4C00 and following */
+-	struct r300_state_atom rb3d_cctl;	/* (4E00) */
+-	struct r300_state_atom bld;	/* blending (4E04) */
+-	struct r300_state_atom cmk;	/* colormask (4E0C) */
+-	struct r300_state_atom blend_color;	/* constant blend color */
+-	struct r300_state_atom rop;	/* ropcntl */
+-	struct r300_state_atom cb;	/* colorbuffer (4E28) */
+-	struct r300_state_atom rb3d_dither_ctl;	/* (4E50) */
+-	struct r300_state_atom rb3d_aaresolve_ctl;	/* (4E88) */
+-	struct r300_state_atom rb3d_discard_src_pixel_lte_threshold;	/* (4E88) I saw it only written on RV350 hardware..  */
+-	struct r300_state_atom zs;	/* zstencil control (4F00) */
+-	struct r300_state_atom zstencil_format;
+-	struct r300_state_atom zb;	/* z buffer (4F20) */
+-	struct r300_state_atom zb_depthclearvalue;	/* (4F28) */
+-	struct r300_state_atom unk4F30;	/* (4F30) */
+-	struct r300_state_atom zb_hiz_offset;	/* (4F44) */
+-	struct r300_state_atom zb_hiz_pitch;	/* (4F54) */
+-
+-	struct r300_state_atom vpi;	/* vp instructions */
+-	struct r300_state_atom vpp;	/* vp parameters */
+-	struct r300_state_atom vps;	/* vertex point size (?) */
+-	struct r300_state_atom vpucp[6];	/* vp user clip plane - 6 */
++	struct radeon_state_atom vpt;	/* viewport (1D98) */
++	struct radeon_state_atom vap_cntl;
++        struct radeon_state_atom vap_index_offset; /* 0x208c r5xx only */
++	struct radeon_state_atom vof;	/* VAP output format register 0x2090 */
++	struct radeon_state_atom vte;	/* (20B0) */
++	struct radeon_state_atom vap_vf_max_vtx_indx;	/* Maximum Vertex Indx Clamp (2134) */
++	struct radeon_state_atom vap_cntl_status;
++	struct radeon_state_atom vir[2];	/* vap input route (2150/21E0) */
++	struct radeon_state_atom vic;	/* vap input control (2180) */
++	struct radeon_state_atom vap_psc_sgn_norm_cntl; /* Programmable Stream Control Signed Normalize Control (21DC) */
++	struct radeon_state_atom vap_clip_cntl;
++	struct radeon_state_atom vap_clip;
++	struct radeon_state_atom vap_pvs_vtx_timeout_reg;	/* Vertex timeout register (2288) */
++	struct radeon_state_atom pvs;	/* pvs_cntl (22D0) */
++	struct radeon_state_atom gb_enable;	/* (4008) */
++	struct radeon_state_atom gb_misc;	/* Multisampling position shifts ? (4010) */
++	struct radeon_state_atom ga_point_s0;	/* S Texture Coordinate of Vertex 0 for Point texture stuffing (LLC) (4200) */
++	struct radeon_state_atom ga_triangle_stipple;	/* (4214) */
++	struct radeon_state_atom ps;	/* pointsize (421C) */
++	struct radeon_state_atom ga_point_minmax;	/* (4230) */
++	struct radeon_state_atom lcntl;	/* line control */
++	struct radeon_state_atom ga_line_stipple;	/* (4260) */
++	struct radeon_state_atom shade;
++	struct radeon_state_atom polygon_mode;
++	struct radeon_state_atom fogp;	/* fog parameters (4294) */
++	struct radeon_state_atom ga_soft_reset;	/* (429C) */
++	struct radeon_state_atom zbias_cntl;
++	struct radeon_state_atom zbs;	/* zbias (42A4) */
++	struct radeon_state_atom occlusion_cntl;
++	struct radeon_state_atom cul;	/* cull cntl (42B8) */
++	struct radeon_state_atom su_depth_scale;	/* (42C0) */
++	struct radeon_state_atom rc;	/* rs control (4300) */
++	struct radeon_state_atom ri;	/* rs interpolators (4310) */
++	struct radeon_state_atom rr;	/* rs route (4330) */
++	struct radeon_state_atom sc_hyperz;	/* (43A4) */
++	struct radeon_state_atom sc_screendoor;	/* (43E8) */
++	struct radeon_state_atom fp;	/* fragment program cntl + nodes (4600) */
++	struct radeon_state_atom fpt;	/* texi - (4620) */
++	struct radeon_state_atom us_out_fmt;	/* (46A4) */
++	struct radeon_state_atom r500fp;	/* r500 fp instructions */
++	struct radeon_state_atom r500fp_const;	/* r500 fp constants */
++	struct radeon_state_atom fpi[4];	/* fp instructions (46C0/47C0/48C0/49C0) */
++	struct radeon_state_atom fogs;	/* fog state (4BC0) */
++	struct radeon_state_atom fogc;	/* fog color (4BC8) */
++	struct radeon_state_atom at;	/* alpha test (4BD4) */
++	struct radeon_state_atom fg_depth_src;	/* (4BD8) */
++	struct radeon_state_atom fpp;	/* 0x4C00 and following */
++	struct radeon_state_atom rb3d_cctl;	/* (4E00) */
++	struct radeon_state_atom bld;	/* blending (4E04) */
++	struct radeon_state_atom cmk;	/* colormask (4E0C) */
++	struct radeon_state_atom blend_color;	/* constant blend color */
++	struct radeon_state_atom rop;	/* ropcntl */
++	struct radeon_state_atom cb;	/* colorbuffer (4E28) */
++	struct radeon_state_atom rb3d_dither_ctl;	/* (4E50) */
++	struct radeon_state_atom rb3d_aaresolve_ctl;	/* (4E88) */
++	struct radeon_state_atom rb3d_discard_src_pixel_lte_threshold;	/* (4E88) I saw it only written on RV350 hardware..  */
++	struct radeon_state_atom zs;	/* zstencil control (4F00) */
++	struct radeon_state_atom zstencil_format;
++	struct radeon_state_atom zb;	/* z buffer (4F20) */
++	struct radeon_state_atom zb_depthclearvalue;	/* (4F28) */
++	struct radeon_state_atom unk4F30;	/* (4F30) */
++	struct radeon_state_atom zb_hiz_offset;	/* (4F44) */
++	struct radeon_state_atom zb_hiz_pitch;	/* (4F54) */
++
++	struct radeon_state_atom vpi;	/* vp instructions */
++	struct radeon_state_atom vpp;	/* vp parameters */
++	struct radeon_state_atom vps;	/* vertex point size (?) */
++	struct radeon_state_atom vpucp[6];	/* vp user clip plane - 6 */
+ 	/* 8 texture units */
+ 	/* the state is grouped by function and not by
+ 	   texture unit. This makes single unit updates
+ 	   really awkward - we are much better off
+ 	   updating the whole thing at once */
+ 	struct {
+-		struct r300_state_atom filter;
+-		struct r300_state_atom filter_1;
+-		struct r300_state_atom size;
+-		struct r300_state_atom format;
+-		struct r300_state_atom pitch;
+-		struct r300_state_atom offset;
+-		struct r300_state_atom chroma_key;
+-		struct r300_state_atom border_color;
++		struct radeon_state_atom filter;
++		struct radeon_state_atom filter_1;
++		struct radeon_state_atom size;
++		struct radeon_state_atom format;
++		struct radeon_state_atom pitch;
++		struct radeon_state_atom offset;
++		struct radeon_state_atom chroma_key;
++		struct radeon_state_atom border_color;
+ 	} tex;
+-	struct r300_state_atom txe;	/* tex enable (4104) */
+-};
++	struct radeon_state_atom txe;	/* tex enable (4104) */
+ 
+-/**
+- * This structure holds the command buffer while it is being constructed.
+- *
+- * The first batch of commands in the buffer is always the state that needs
+- * to be re-emitted when the context is lost. This batch can be skipped
+- * otherwise.
+- */
+-struct r300_cmdbuf {
+-	int size;		/* DWORDs allocated for buffer */
+-	uint32_t *cmd_buf;
+-	int count_used;		/* DWORDs filled so far */
+-	int count_reemit;	/* size of re-emission batch */
++	radeonTexObj *textures[R300_MAX_TEXTURE_UNITS];
+ };
+ 
+ /**
+  * State cache
+  */
+ 
+-struct r300_depthbuffer_state {
+-	GLfloat scale;
+-};
+-
+-struct r300_stencilbuffer_state {
+-	GLboolean hw_stencil;
+-};
+-
+ /* Vertex shader state */
+ 
+ /* Perhaps more if we store programs in vmem? */
+@@ -812,22 +630,18 @@ struct r500_fragment_program {
+ #define REG_TEX0	2
+ 
+ struct r300_state {
+-	struct r300_depthbuffer_state depth;
+ 	struct r300_texture_state texture;
+ 	int sw_tcl_inputs[VERT_ATTRIB_MAX];
+ 	struct r300_vertex_shader_state vertex_shader;
+-	struct r300_dma_region aos[R300_MAX_AOS_ARRAYS];
++	struct radeon_aos aos[R300_MAX_AOS_ARRAYS];
+ 	int aos_count;
+ 
+-	GLuint *Elts;
+-	struct r300_dma_region elt_dma;
++	struct radeon_bo *elt_dma_bo; /** Buffer object that contains element indices */
++	int elt_dma_offset; /** Offset into this buffer object, in bytes */
+ 
+-	struct r300_dma_region swtcl_dma;
+ 	DECLARE_RENDERINPUTS(render_inputs_bitset);	/* actual render inputs that R300 was configured for.
+ 							   They are the same as tnl->render_inputs for fixed pipeline */
+ 
+-	struct r300_stencilbuffer_state stencil;
+-
+ };
+ 
+ #define R300_FALLBACK_NONE 0
+@@ -837,41 +651,7 @@ struct r300_state {
+ /* r300_swtcl.c
+  */
+ struct r300_swtcl_info {
+-   GLuint RenderIndex;
+-
+-   /**
+-    * Size of a hardware vertex.  This is calculated when \c ::vertex_attrs is
+-    * installed in the Mesa state vector.
+-    */
+-   GLuint vertex_size;
+-
+-   /**
+-    * Attributes instructing the Mesa TCL pipeline where / how to put vertex
+-    * data in the hardware buffer.
+-    */
+-   struct tnl_attr_map vertex_attrs[VERT_ATTRIB_MAX];
+-
+-   /**
+-    * Number of elements of \c ::vertex_attrs that are actually used.
+-    */
+-   GLuint vertex_attr_count;
+-
+-   /**
+-    * Cached pointer to the buffer where Mesa will store vertex data.
+-    */
+-   GLubyte *verts;
+-
+-   /* Fallback rasterization functions
+-    */
+-  //   r200_point_func draw_point;
+-  //   r200_line_func draw_line;
+-  //   r200_tri_func draw_tri;
+-
+-   GLuint hw_primitive;
+-   GLenum render_primitive;
+-   GLuint numverts;
+-
+-   /**
++  /*
+     * Offset of the 4UB color data within a hardware (swtcl) vertex.
+     */
+    GLuint coloroffset;
+@@ -880,13 +660,6 @@ struct r300_swtcl_info {
+     * Offset of the 3UB specular color data within a hardware (swtcl) vertex.
+     */
+    GLuint specoffset;
+-
+-   /**
+-    * Should Mesa project vertex data or will the hardware do it?
+-    */
+-   GLboolean needproj;
+-
+-   struct r300_dma_region indexed_verts;
+ };
+ 
+ 
+@@ -897,40 +670,22 @@ struct r300_context {
+ 	struct radeon_context radeon;	/* parent class, must be first */
+ 
+ 	struct r300_hw_state hw;
+-	struct r300_cmdbuf cmdbuf;
++
+ 	struct r300_state state;
+ 	struct gl_vertex_program *curr_vp;
+ 	struct r300_vertex_program *selected_vp;
+ 
+ 	/* Vertex buffers
+ 	 */
+-	struct r300_dma dma;
+-	GLboolean save_on_next_unlock;
+-	GLuint NewGLState;
+-
+-	/* Texture object bookkeeping
+-	 */
+-	unsigned nr_heaps;
+-	driTexHeap *texture_heaps[RADEON_NR_TEX_HEAPS];
+-	driTextureObject swapped;
+-	int texture_depth;
+-	float initialMaxAnisotropy;
+-
+-	/* Clientdata textures;
+-	 */
+-	GLuint prefer_gart_client_texturing;
+-
+-#ifdef USER_BUFFERS
+-	struct r300_memory_manager *rmm;
+-#endif
+-
+ 	GLvector4f dummy_attrib[_TNL_ATTRIB_MAX];
+ 	GLvector4f *temp_attrib[_TNL_ATTRIB_MAX];
+ 
+ 	GLboolean disable_lowimpact_fallback;
+ 
+ 	DECLARE_RENDERINPUTS(tnl_index_bitset);	/* index of bits for last tnl_install_attrs */
++	
+ 	struct r300_swtcl_info swtcl;
++	GLboolean vap_flush_needed;
+ };
+ 
+ struct r300_buffer_object {
+@@ -956,4 +711,7 @@ extern int r300VertexProgUpdateParams(GLcontext * ctx,
+ #define RADEON_D_PLAYBACK_RAW 2
+ #define RADEON_D_T 3
+ 
++#define r300PackFloat32 radeonPackFloat32
++#define r300PackFloat24 radeonPackFloat24
++
+ #endif				/* __R300_CONTEXT_H__ */
+diff --git a/src/mesa/drivers/dri/r300/r300_emit.c b/src/mesa/drivers/dri/r300/r300_emit.c
+index 80bd338..1512e90 100644
+--- a/src/mesa/drivers/dri/r300/r300_emit.c
++++ b/src/mesa/drivers/dri/r300/r300_emit.c
+@@ -46,14 +46,10 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "tnl/t_context.h"
+ 
+ #include "r300_context.h"
+-#include "radeon_ioctl.h"
+ #include "r300_state.h"
+ #include "r300_emit.h"
+ #include "r300_ioctl.h"
+ 
+-#ifdef USER_BUFFERS
+-#include "r300_mem.h"
+-#endif
+ 
+ #if SWIZZLE_X != R300_INPUT_ROUTE_SELECT_X || \
+     SWIZZLE_Y != R300_INPUT_ROUTE_SELECT_Y || \
+@@ -66,147 +62,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ #define DEBUG_ALL DEBUG_VERTS
+ 
+-#if defined(USE_X86_ASM)
+-#define COPY_DWORDS( dst, src, nr )					\
+-do {									\
+-	int __tmp;							\
+-	__asm__ __volatile__( "rep ; movsl"				\
+-			      : "=%c" (__tmp), "=D" (dst), "=S" (__tmp)	\
+-			      : "0" (nr),				\
+-			        "D" ((long)dst),			\
+-			        "S" ((long)src) );			\
+-} while (0)
+-#else
+-#define COPY_DWORDS( dst, src, nr )		\
+-do {						\
+-   int j;					\
+-   for ( j = 0 ; j < nr ; j++ )			\
+-      dst[j] = ((int *)src)[j];			\
+-   dst += nr;					\
+-} while (0)
+-#endif
+-
+-static void r300EmitVec4(GLcontext * ctx, struct r300_dma_region *rvb,
+-			 GLvoid * data, int stride, int count)
+-{
+-	int i;
+-	int *out = (int *)(rvb->address + rvb->start);
+-
+-	if (RADEON_DEBUG & DEBUG_VERTS)
+-		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
+-			__FUNCTION__, count, stride, (void *)out, (void *)data);
+-
+-	if (stride == 4)
+-		COPY_DWORDS(out, data, count);
+-	else
+-		for (i = 0; i < count; i++) {
+-			out[0] = *(int *)data;
+-			out++;
+-			data += stride;
+-		}
+-}
+-
+-static void r300EmitVec8(GLcontext * ctx, struct r300_dma_region *rvb,
+-			 GLvoid * data, int stride, int count)
+-{
+-	int i;
+-	int *out = (int *)(rvb->address + rvb->start);
+-
+-	if (RADEON_DEBUG & DEBUG_VERTS)
+-		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
+-			__FUNCTION__, count, stride, (void *)out, (void *)data);
+-
+-	if (stride == 8)
+-		COPY_DWORDS(out, data, count * 2);
+-	else
+-		for (i = 0; i < count; i++) {
+-			out[0] = *(int *)data;
+-			out[1] = *(int *)(data + 4);
+-			out += 2;
+-			data += stride;
+-		}
+-}
+-
+-static void r300EmitVec12(GLcontext * ctx, struct r300_dma_region *rvb,
+-			  GLvoid * data, int stride, int count)
+-{
+-	int i;
+-	int *out = (int *)(rvb->address + rvb->start);
+-
+-	if (RADEON_DEBUG & DEBUG_VERTS)
+-		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
+-			__FUNCTION__, count, stride, (void *)out, (void *)data);
+-
+-	if (stride == 12)
+-		COPY_DWORDS(out, data, count * 3);
+-	else
+-		for (i = 0; i < count; i++) {
+-			out[0] = *(int *)data;
+-			out[1] = *(int *)(data + 4);
+-			out[2] = *(int *)(data + 8);
+-			out += 3;
+-			data += stride;
+-		}
+-}
+-
+-static void r300EmitVec16(GLcontext * ctx, struct r300_dma_region *rvb,
+-			  GLvoid * data, int stride, int count)
+-{
+-	int i;
+-	int *out = (int *)(rvb->address + rvb->start);
+-
+-	if (RADEON_DEBUG & DEBUG_VERTS)
+-		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
+-			__FUNCTION__, count, stride, (void *)out, (void *)data);
+-
+-	if (stride == 16)
+-		COPY_DWORDS(out, data, count * 4);
+-	else
+-		for (i = 0; i < count; i++) {
+-			out[0] = *(int *)data;
+-			out[1] = *(int *)(data + 4);
+-			out[2] = *(int *)(data + 8);
+-			out[3] = *(int *)(data + 12);
+-			out += 4;
+-			data += stride;
+-		}
+-}
+-
+-static void r300EmitVec(GLcontext * ctx, struct r300_dma_region *rvb,
+-			GLvoid * data, int size, int stride, int count)
+-{
+-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+-
+-	if (stride == 0) {
+-		r300AllocDmaRegion(rmesa, rvb, size * 4, 4);
+-		count = 1;
+-		rvb->aos_offset = GET_START(rvb);
+-		rvb->aos_stride = 0;
+-	} else {
+-		r300AllocDmaRegion(rmesa, rvb, size * count * 4, 4);
+-		rvb->aos_offset = GET_START(rvb);
+-		rvb->aos_stride = size;
+-	}
+-
+-	switch (size) {
+-	case 1:
+-		r300EmitVec4(ctx, rvb, data, stride, count);
+-		break;
+-	case 2:
+-		r300EmitVec8(ctx, rvb, data, stride, count);
+-		break;
+-	case 3:
+-		r300EmitVec12(ctx, rvb, data, stride, count);
+-		break;
+-	case 4:
+-		r300EmitVec16(ctx, rvb, data, stride, count);
+-		break;
+-	default:
+-		assert(0);
+-		break;
+-	}
+-}
+-
+ #define DW_SIZE(x) ((inputs[tab[(x)]] << R300_DST_VEC_LOC_SHIFT) |	\
+ 		    (attribptr[tab[(x)]]->size - 1) << R300_DATA_TYPE_0_SHIFT)
+ 
+@@ -314,10 +169,6 @@ GLuint r300VAPOutputCntl0(GLcontext * ctx, GLuint OutputsWritten)
+ 		    R300_VAP_OUTPUT_VTX_FMT_0__COLOR_2_PRESENT |
+ 		    R300_VAP_OUTPUT_VTX_FMT_0__COLOR_3_PRESENT;
+ 
+-#if 0
+-	if (OutputsWritten & (1 << VERT_RESULT_FOGC)) ;
+-#endif
+-
+ 	if (OutputsWritten & (1 << VERT_RESULT_PSIZ))
+ 		ret |= R300_VAP_OUTPUT_VTX_FMT_0__PT_SIZE_PRESENT;
+ 
+@@ -371,7 +222,6 @@ int r300EmitArrays(GLcontext * ctx)
+ 
+ 		assert(RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_POS));
+ 		assert(RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_NORMAL) == 0);
+-		//assert(RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_COLOR0));
+ 
+ 		if (RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_POS)) {
+ 			InputsRead |= 1 << VERT_ATTRIB_POS;
+@@ -433,7 +283,7 @@ int r300EmitArrays(GLcontext * ctx)
+ 	}
+ 
+ 	for (i = 0; i < nr; i++) {
+-		int ci, fix, found = 0;
++		int ci;
+ 
+ 		swizzle[i][0] = SWIZZLE_ZERO;
+ 		swizzle[i][1] = SWIZZLE_ZERO;
+@@ -443,61 +293,35 @@ int r300EmitArrays(GLcontext * ctx)
+ 		for (ci = 0; ci < vb->AttribPtr[tab[i]]->size; ci++) {
+ 			swizzle[i][ci] = ci;
+ 		}
+-
+-		if (r300IsGartMemory(rmesa, vb->AttribPtr[tab[i]]->data, 4)) {
+-			if (vb->AttribPtr[tab[i]]->stride % 4) {
+-				return R300_FALLBACK_TCL;
+-			}
+-			rmesa->state.aos[i].address = (void *)(vb->AttribPtr[tab[i]]->data);
+-			rmesa->state.aos[i].start = 0;
+-			rmesa->state.aos[i].aos_offset = r300GartOffsetFromVirtual(rmesa, vb->AttribPtr[tab[i]]->data);
+-			rmesa->state.aos[i].aos_stride = vb->AttribPtr[tab[i]]->stride / 4;
+-			rmesa->state.aos[i].aos_size = vb->AttribPtr[tab[i]]->size;
+-		} else {
+-			r300EmitVec(ctx, &rmesa->state.aos[i],
++		rcommon_emit_vector(ctx, &rmesa->state.aos[i],
+ 				    vb->AttribPtr[tab[i]]->data,
+ 				    vb->AttribPtr[tab[i]]->size,
+ 				    vb->AttribPtr[tab[i]]->stride, count);
+-		}
+-
+-		rmesa->state.aos[i].aos_size = vb->AttribPtr[tab[i]]->size;
+-
+-		for (fix = 0; fix <= 4 - vb->AttribPtr[tab[i]]->size; fix++) {
+-			if ((rmesa->state.aos[i].aos_offset - _mesa_sizeof_type(GL_FLOAT) * fix) % 4) {
+-				continue;
+-			}
+-			found = 1;
+-			break;
+-		}
+-
+-		if (found) {
+-			if (fix > 0) {
+-				WARN_ONCE("Feeling lucky?\n");
+-			}
+-			rmesa->state.aos[i].aos_offset -= _mesa_sizeof_type(GL_FLOAT) * fix;
+-			for (ci = 0; ci < vb->AttribPtr[tab[i]]->size; ci++) {
+-				swizzle[i][ci] += fix;
+-			}
+-		} else {
+-			WARN_ONCE
+-			    ("Cannot handle offset %x with stride %d, comp %d\n",
+-			     rmesa->state.aos[i].aos_offset,
+-			     rmesa->state.aos[i].aos_stride,
+-			     vb->AttribPtr[tab[i]]->size);
+-			return R300_FALLBACK_TCL;
+-		}
+ 	}
+ 
+ 	/* Setup INPUT_ROUTE. */
+-	R300_STATECHANGE(rmesa, vir[0]);
+-	((drm_r300_cmd_header_t *) rmesa->hw.vir[0].cmd)->packet0.count =
+-	    r300VAPInputRoute0(&rmesa->hw.vir[0].cmd[R300_VIR_CNTL_0],
+-			       vb->AttribPtr, inputs, tab, nr);
+-	R300_STATECHANGE(rmesa, vir[1]);
+-	((drm_r300_cmd_header_t *) rmesa->hw.vir[1].cmd)->packet0.count =
+-	    r300VAPInputRoute1(&rmesa->hw.vir[1].cmd[R300_VIR_CNTL_0], swizzle,
+-			       nr);
+-
++	if (rmesa->radeon.radeonScreen->kernel_mm) {
++		R300_STATECHANGE(rmesa, vir[0]);
++		rmesa->hw.vir[0].cmd[0] &= 0xC000FFFF;
++		rmesa->hw.vir[1].cmd[0] &= 0xC000FFFF;
++		rmesa->hw.vir[0].cmd[0] |=
++			(r300VAPInputRoute0(&rmesa->hw.vir[0].cmd[R300_VIR_CNTL_0],
++					    vb->AttribPtr, inputs, tab, nr) & 0x3FFF) << 16;
++		R300_STATECHANGE(rmesa, vir[1]);
++		rmesa->hw.vir[1].cmd[0] |=
++			(r300VAPInputRoute1(&rmesa->hw.vir[1].cmd[R300_VIR_CNTL_0], swizzle,
++					    nr) & 0x3FFF) << 16;
++	} else {
++		R300_STATECHANGE(rmesa, vir[0]);
++		((drm_r300_cmd_header_t *) rmesa->hw.vir[0].cmd)->packet0.count =
++			r300VAPInputRoute0(&rmesa->hw.vir[0].cmd[R300_VIR_CNTL_0],
++					   vb->AttribPtr, inputs, tab, nr);
++		R300_STATECHANGE(rmesa, vir[1]);
++		((drm_r300_cmd_header_t *) rmesa->hw.vir[1].cmd)->packet0.count =
++			r300VAPInputRoute1(&rmesa->hw.vir[1].cmd[R300_VIR_CNTL_0], swizzle,
++					   nr);
++	}
++	
+ 	/* Setup INPUT_CNTL. */
+ 	R300_STATECHANGE(rmesa, vic);
+ 	rmesa->hw.vic.cmd[R300_VIC_CNTL_0] = r300VAPInputCntl0(ctx, InputsRead);
+@@ -515,45 +339,34 @@ int r300EmitArrays(GLcontext * ctx)
+ 	return R300_FALLBACK_NONE;
+ }
+ 
+-#ifdef USER_BUFFERS
+-void r300UseArrays(GLcontext * ctx)
+-{
+-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+-	int i;
+-
+-	if (rmesa->state.elt_dma.buf)
+-		r300_mem_use(rmesa, rmesa->state.elt_dma.buf->id);
+-
+-	for (i = 0; i < rmesa->state.aos_count; i++) {
+-		if (rmesa->state.aos[i].buf)
+-			r300_mem_use(rmesa, rmesa->state.aos[i].buf->id);
+-	}
+-}
+-#endif
+-
+ void r300ReleaseArrays(GLcontext * ctx)
+ {
+ 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+ 	int i;
+ 
+-	r300ReleaseDmaRegion(rmesa, &rmesa->state.elt_dma, __FUNCTION__);
++	if (rmesa->state.elt_dma_bo) {	/* element-index buffer object, if one is held */
++		radeon_bo_unref(rmesa->state.elt_dma_bo);
++		rmesa->state.elt_dma_bo = NULL;	/* a repeated call then skips the unref */
++	}
+ 	for (i = 0; i < rmesa->state.aos_count; i++) {
+-		r300ReleaseDmaRegion(rmesa, &rmesa->state.aos[i], __FUNCTION__);
++		if (rmesa->state.aos[i].bo) {	/* only the first aos_count entries are visited */
++			radeon_bo_unref(rmesa->state.aos[i].bo);
++			rmesa->state.aos[i].bo = NULL;	/* a repeated call then skips the unref */
++		}
+ 	}
+ }
+ 
+ void r300EmitCacheFlush(r300ContextPtr rmesa)
+ {
+-	int cmd_reserved = 0;
+-	int cmd_written = 0;
+-
+-	drm_radeon_cmd_header_t *cmd = NULL;
+-
+-	reg_start(R300_RB3D_DSTCACHE_CTLSTAT, 0);
+-	e32(R300_RB3D_DSTCACHE_CTLSTAT_DC_FREE_FREE_3D_TAGS |
+-	    R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D);
+-
+-	reg_start(R300_ZB_ZCACHE_CTLSTAT, 0);
+-	e32(R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_FLUSH_AND_FREE |
+-	    R300_ZB_ZCACHE_CTLSTAT_ZC_FREE_FREE);
++	BATCH_LOCALS(&rmesa->radeon);
++
++	BEGIN_BATCH(4);
++	OUT_BATCH_REGVAL(R300_RB3D_DSTCACHE_CTLSTAT,
++		R300_RB3D_DSTCACHE_CTLSTAT_DC_FREE_FREE_3D_TAGS |
++		R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D);
++	OUT_BATCH_REGVAL(R300_ZB_ZCACHE_CTLSTAT,
++		R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_FLUSH_AND_FREE |
++		R300_ZB_ZCACHE_CTLSTAT_ZC_FREE_FREE);
++	END_BATCH();
++	COMMIT_BATCH();
+ }
+diff --git a/src/mesa/drivers/dri/r300/r300_emit.h b/src/mesa/drivers/dri/r300/r300_emit.h
+index 89d7383..6bc8f8e 100644
+--- a/src/mesa/drivers/dri/r300/r300_emit.h
++++ b/src/mesa/drivers/dri/r300/r300_emit.h
+@@ -44,28 +44,31 @@
+ #include "r300_cmdbuf.h"
+ #include "radeon_reg.h"
+ 
+-/* TODO: move these defines (and the ones from DRM) into r300_reg.h and sync up
+- * with DRM */
+-#define CP_PACKET0(reg, n)	(RADEON_CP_PACKET0 | ((n)<<16) | ((reg)>>2))
+-#define CP_PACKET3( pkt, n )						\
+-	(RADEON_CP_PACKET3 | (pkt) | ((n) << 16))
+-
+-static INLINE uint32_t cmdpacket0(int reg, int count)
++static INLINE uint32_t cmdpacket0(struct radeon_screen *rscrn,
++                                  int reg, int count)
+ {
+-	drm_r300_cmd_header_t cmd;
+-
+-	cmd.packet0.cmd_type = R300_CMD_PACKET0;
+-	cmd.packet0.count = count;
+-	cmd.packet0.reghi = ((unsigned int)reg & 0xFF00) >> 8;
+-	cmd.packet0.reglo = ((unsigned int)reg & 0x00FF);
+-
+-	return cmd.u;
++	if (!rscrn->kernel_mm) {	/* no kernel_mm: build a drm_r300_cmd_header_t packet0 header */
++		drm_r300_cmd_header_t cmd;
++
++		cmd.u = 0;	/* clear the whole word before the bitfields are filled in */
++		cmd.packet0.cmd_type = R300_CMD_PACKET0;
++		cmd.packet0.count = count;
++		cmd.packet0.reghi = ((unsigned int)reg & 0xFF00) >> 8;
++		cmd.packet0.reglo = ((unsigned int)reg & 0x00FF);
++
++		return cmd.u;
++	}
++	if (count) {	/* kernel_mm: CP_PACKET0 is handed count - 1 */
++		return CP_PACKET0(reg, count - 1);
++	}
++	return CP_PACKET2;	/* kernel_mm with count == 0 */
+ }
+ 
+-static INLINE uint32_t cmdvpu(int addr, int count)
++static INLINE uint32_t cmdvpu(struct radeon_screen *rscrn, int addr, int count)
+ {
+ 	drm_r300_cmd_header_t cmd;
+ 
++	cmd.u = 0;
+ 	cmd.vpu.cmd_type = R300_CMD_VPU;
+ 	cmd.vpu.count = count;
+ 	cmd.vpu.adrhi = ((unsigned int)addr & 0xFF00) >> 8;
+@@ -74,10 +77,12 @@ static INLINE uint32_t cmdvpu(int addr, int count)
+ 	return cmd.u;
+ }
+ 
+-static INLINE uint32_t cmdr500fp(int addr, int count, int type, int clamp)
++static INLINE uint32_t cmdr500fp(struct radeon_screen *rscrn,
++                                 int addr, int count, int type, int clamp)
+ {
+ 	drm_r300_cmd_header_t cmd;
+ 
++	cmd.u = 0;
+ 	cmd.r500fp.cmd_type = R300_CMD_R500FP;
+ 	cmd.r500fp.count = count;
+ 	cmd.r500fp.adrhi_flags = ((unsigned int)addr & 0x100) >> 8;
+@@ -88,169 +93,131 @@ static INLINE uint32_t cmdr500fp(int addr, int count, int type, int clamp)
+ 	return cmd.u;
+ }
+ 
+-static INLINE uint32_t cmdpacket3(int packet)
++static INLINE uint32_t cmdpacket3(struct radeon_screen *rscrn, int packet)
+ {
+ 	drm_r300_cmd_header_t cmd;
+ 
++	cmd.u = 0;
+ 	cmd.packet3.cmd_type = R300_CMD_PACKET3;
+ 	cmd.packet3.packet = packet;
+ 
+ 	return cmd.u;
+ }
+ 
+-static INLINE uint32_t cmdcpdelay(unsigned short count)
++static INLINE uint32_t cmdcpdelay(struct radeon_screen *rscrn,  
++                                  unsigned short count)
+ {
+ 	drm_r300_cmd_header_t cmd;
+ 
++	cmd.u = 0;
++
+ 	cmd.delay.cmd_type = R300_CMD_CP_DELAY;
+ 	cmd.delay.count = count;
+ 
+ 	return cmd.u;
+ }
+ 
+-static INLINE uint32_t cmdwait(unsigned char flags)
++static INLINE uint32_t cmdwait(struct radeon_screen *rscrn,
++                               unsigned char flags)
+ {
+ 	drm_r300_cmd_header_t cmd;
+ 
++	cmd.u = 0;
+ 	cmd.wait.cmd_type = R300_CMD_WAIT;
+ 	cmd.wait.flags = flags;
+ 
+ 	return cmd.u;
+ }
+ 
+-static INLINE uint32_t cmdpacify(void)
++static INLINE uint32_t cmdpacify(struct radeon_screen *rscrn)
+ {
+ 	drm_r300_cmd_header_t cmd;
+ 
++	cmd.u = 0;
+ 	cmd.header.cmd_type = R300_CMD_END3D;
+ 
+ 	return cmd.u;
+ }
+ 
+ /**
+- * Prepare to write a register value to register at address reg.
+- * If num_extra > 0 then the following extra values are written
+- * to registers with address +4, +8 and so on..
+- */
+-#define reg_start(reg, num_extra)					\
+-	do {								\
+-		int _n;							\
+-		_n=(num_extra);						\
+-		cmd = (drm_radeon_cmd_header_t*)			\
+-			r300AllocCmdBuf(rmesa,				\
+-					(_n+2),				\
+-					__FUNCTION__);			\
+-		cmd_reserved=_n+2;					\
+-		cmd_written=1;						\
+-		cmd[0].i=cmdpacket0((reg), _n+1);			\
+-	} while (0);
+-
+-/**
+- * Emit GLuint freestyle
++ * Write the header of a packet3 to the command buffer.
++ * Outputs 2 dwords and expects (num_extra+1) additional dwords afterwards.
+  */
+-#define e32(dword)							\
+-	do {								\
+-		if(cmd_written<cmd_reserved) {				\
+-			cmd[cmd_written].i=(dword);			\
+-			cmd_written++;					\
+-		} else {						\
+-			fprintf(stderr,					\
+-				"e32 but no previous packet "		\
+-				"declaration.\n"			\
+-				"Aborting! in %s::%s at line %d, "	\
+-				"cmd_written=%d cmd_reserved=%d\n",	\
+-				__FILE__, __FUNCTION__, __LINE__,	\
+-				cmd_written, cmd_reserved);		\
+-			_mesa_exit(-1);					\
+-		}							\
++#define OUT_BATCH_PACKET3(packet, num_extra) do {\
++    if (!b_l_rmesa->radeonScreen->kernel_mm) {		\
++    	OUT_BATCH(cmdpacket3(b_l_rmesa->radeonScreen,\
++                  R300_CMD_PACKET3_RAW)); \
++    } else b_l_rmesa->cmdbuf.cs->section_cdw++;\
++	OUT_BATCH(CP_PACKET3((packet), (num_extra))); \
+ 	} while(0)
+ 
+-#define	efloat(f) e32(r300PackFloat32(f))
+-
+-#define vsf_start_fragment(dest, length)				\
+-	do {								\
+-		int _n;							\
+-		_n = (length);						\
+-		cmd = (drm_radeon_cmd_header_t*)			\
+-			r300AllocCmdBuf(rmesa,				\
+-					(_n+1),				\
+-					__FUNCTION__);			\
+-		cmd_reserved = _n+2;					\
+-		cmd_written =1;						\
+-		cmd[0].i = cmdvpu((dest), _n/4);			\
+-	} while (0);
+-
+-#define r500fp_start_fragment(dest, length)				\
+-	do {								\
+-		int _n;							\
+-		_n = (length);						\
+-		cmd = (drm_radeon_cmd_header_t*)			\
+-			r300AllocCmdBuf(rmesa,				\
+-					(_n+1),				\
+-					__FUNCTION__);			\
+-		cmd_reserved = _n+1;					\
+-		cmd_written =1;						\
+-		cmd[0].i = cmdr500fp((dest), _n/6, 0, 0);		\
+-	} while (0);
+-
+-#define start_packet3(packet, count)					\
+-	{								\
+-		int _n;							\
+-		GLuint _p;						\
+-		_n = (count);						\
+-		_p = (packet);						\
+-		cmd = (drm_radeon_cmd_header_t*)			\
+-			r300AllocCmdBuf(rmesa,				\
+-					(_n+3),				\
+-					__FUNCTION__);			\
+-		cmd_reserved = _n+3;					\
+-		cmd_written = 2;					\
+-		if(_n > 0x3fff) {					\
+-			fprintf(stderr,"Too big packet3 %08x: cannot "	\
+-				"store %d dwords\n",			\
+-				_p, _n);				\
+-			_mesa_exit(-1);					\
+-		}							\
+-		cmd[0].i = cmdpacket3(R300_CMD_PACKET3_RAW);		\
+-		cmd[1].i = _p | ((_n & 0x3fff)<<16);			\
+-	}
+-
+ /**
+  * Must be sent to switch to 2d commands
+  */
+-void static INLINE end_3d(r300ContextPtr rmesa)
++void static INLINE end_3d(radeonContextPtr radeon)
+ {
+-	drm_radeon_cmd_header_t *cmd = NULL;
++	BATCH_LOCALS(radeon);
+ 
+-	cmd =
+-	    (drm_radeon_cmd_header_t *) r300AllocCmdBuf(rmesa, 1, __FUNCTION__);
+-	cmd[0].header.cmd_type = R300_CMD_END3D;
++	if (!radeon->radeonScreen->kernel_mm) {
++		BEGIN_BATCH_NO_AUTOSTATE(1);
++		OUT_BATCH(cmdpacify(radeon->radeonScreen));
++		END_BATCH();
++	}
+ }
+ 
+ void static INLINE cp_delay(r300ContextPtr rmesa, unsigned short count)
+ {
+-	drm_radeon_cmd_header_t *cmd = NULL;
++	BATCH_LOCALS(&rmesa->radeon);
+ 
+-	cmd =
+-	    (drm_radeon_cmd_header_t *) r300AllocCmdBuf(rmesa, 1, __FUNCTION__);
+-	cmd[0].i = cmdcpdelay(count);
++	if (!rmesa->radeon.radeonScreen->kernel_mm) {
++		BEGIN_BATCH_NO_AUTOSTATE(1);
++		OUT_BATCH(cmdcpdelay(rmesa->radeon.radeonScreen, count));
++		END_BATCH();
++	}
+ }
+ 
+-void static INLINE cp_wait(r300ContextPtr rmesa, unsigned char flags)
++void static INLINE cp_wait(radeonContextPtr radeon, unsigned char flags)
+ {
+-	drm_radeon_cmd_header_t *cmd = NULL;
+-
+-	cmd =
+-	    (drm_radeon_cmd_header_t *) r300AllocCmdBuf(rmesa, 1, __FUNCTION__);
+-	cmd[0].i = cmdwait(flags);
++	BATCH_LOCALS(radeon);
++	uint32_t wait_until;	/* value written to RADEON_WAIT_UNTIL on the kernel_mm path */
++
++	if (!radeon->radeonScreen->kernel_mm) {	/* no kernel_mm: a single cmdwait() header dword */
++		BEGIN_BATCH_NO_AUTOSTATE(1);
++		OUT_BATCH(cmdwait(radeon->radeonScreen, flags));
++		END_BATCH();
++	} else {	/* kernel_mm: translate flags into a RADEON_WAIT_UNTIL bit mask */
++		switch(flags) {
++		case R300_WAIT_2D:
++			wait_until = (1 << 14);
++			break;
++		case R300_WAIT_3D:
++			wait_until = (1 << 15);
++			break;
++		case R300_NEW_WAIT_2D_3D:
++			wait_until = (1 << 14) | (1 << 15);
++			break;
++		case R300_NEW_WAIT_2D_2D_CLEAN:
++			wait_until = (1 << 14) | (1 << 16) | (1 << 18);
++			break;
++		case R300_NEW_WAIT_3D_3D_CLEAN:
++			wait_until = (1 << 15) | (1 << 17) | (1 << 18);
++			break;
++		case R300_NEW_WAIT_2D_2D_CLEAN_3D_3D_CLEAN:
++			wait_until  = (1 << 14) | (1 << 16) | (1 << 18);
++			wait_until |= (1 << 15) | (1 << 17) | (1 << 18);	/* bit 18 is already set by the line above */
++			break;
++		default:
++			return;	/* unrecognized flags: nothing is emitted */
++		}
++		BEGIN_BATCH_NO_AUTOSTATE(2);	/* packet header dword + wait_until value */
++		OUT_BATCH(CP_PACKET0(RADEON_WAIT_UNTIL, 0));
++		OUT_BATCH(wait_until);
++		END_BATCH();
++	}
+ }
+ 
+ extern int r300EmitArrays(GLcontext * ctx);
+ 
+-#ifdef USER_BUFFERS
+-void r300UseArrays(GLcontext * ctx);
+-#endif
+-
+ extern void r300ReleaseArrays(GLcontext * ctx);
+ extern int r300PrimitiveType(r300ContextPtr rmesa, int prim);
+ extern int r300NumVerts(r300ContextPtr rmesa, int num_verts, int prim);
+diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c
+index 4ef7f2b..8d030c6 100644
+--- a/src/mesa/drivers/dri/r300/r300_fragprog.c
++++ b/src/mesa/drivers/dri/r300/r300_fragprog.c
+@@ -163,6 +163,19 @@ static GLboolean transform_TEX(
+ 		}
+ 	}
+ 
++	if (inst.SrcReg[0].File != PROGRAM_TEMPORARY && inst.SrcReg[0].File != PROGRAM_INPUT) {
++		int tmpreg = radeonFindFreeTemporary(t);
++		tgt = radeonAppendInstructions(t->Program, 1);
++		tgt->Opcode = OPCODE_MOV;
++		tgt->DstReg.File = PROGRAM_TEMPORARY;
++		tgt->DstReg.Index = tmpreg;
++		tgt->SrcReg[0] = inst.SrcReg[0];
++
++		reset_srcreg(&inst.SrcReg[0]);
++		inst.SrcReg[0].File = PROGRAM_TEMPORARY;
++		inst.SrcReg[0].Index = tmpreg;
++	}
++	
+ 	tgt = radeonAppendInstructions(t->Program, 1);
+ 	_mesa_copy_instructions(tgt, &inst, 1);
+ 
+diff --git a/src/mesa/drivers/dri/r300/r300_ioctl.c b/src/mesa/drivers/dri/r300/r300_ioctl.c
+index ee85e22..619d268 100644
+--- a/src/mesa/drivers/dri/r300/r300_ioctl.c
++++ b/src/mesa/drivers/dri/r300/r300_ioctl.c
+@@ -46,8 +46,9 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "main/context.h"
+ #include "swrast/swrast.h"
+ 
++#include "radeon_common.h"
++#include "radeon_lock.h"
+ #include "r300_context.h"
+-#include "radeon_ioctl.h"
+ #include "r300_ioctl.h"
+ #include "r300_cmdbuf.h"
+ #include "r300_state.h"
+@@ -55,71 +56,83 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "radeon_reg.h"
+ #include "r300_emit.h"
+ #include "r300_fragprog.h"
++#include "r300_context.h"
+ 
+ #include "vblank.h"
+ 
++#define R200_3D_DRAW_IMMD_2      0xC0003500
++
+ #define CLEARBUFFER_COLOR	0x1
+ #define CLEARBUFFER_DEPTH	0x2
+ #define CLEARBUFFER_STENCIL	0x4
+ 
+-static void r300ClearBuffer(r300ContextPtr r300, int flags, int buffer)
++static void r300ClearBuffer(r300ContextPtr r300, int flags,
++			    struct radeon_renderbuffer *rrb,
++			    struct radeon_renderbuffer *rrbd)
+ {
++	BATCH_LOCALS(&r300->radeon);
+ 	GLcontext *ctx = r300->radeon.glCtx;
+ 	__DRIdrawablePrivate *dPriv = r300->radeon.dri.drawable;
+-	GLuint cboffset, cbpitch;
+-	drm_r300_cmd_header_t *cmd2;
+-	int cmd_reserved = 0;
+-	int cmd_written = 0;
+-	drm_radeon_cmd_header_t *cmd = NULL;
++	GLuint cbpitch = 0;
+ 	r300ContextPtr rmesa = r300;
+ 
+ 	if (RADEON_DEBUG & DEBUG_IOCTL)
+-		fprintf(stderr, "%s: %s buffer (%i,%i %ix%i)\n",
+-			__FUNCTION__, buffer ? "back" : "front",
+-			dPriv->x, dPriv->y, dPriv->w, dPriv->h);
+-
+-	if (buffer) {
+-		cboffset = r300->radeon.radeonScreen->backOffset;
+-		cbpitch = r300->radeon.radeonScreen->backPitch;
+-	} else {
+-		cboffset = r300->radeon.radeonScreen->frontOffset;
+-		cbpitch = r300->radeon.radeonScreen->frontPitch;
++		fprintf(stderr, "%s: buffer %p (%i,%i %ix%i)\n",
++			__FUNCTION__, rrb, dPriv->x, dPriv->y,
++			dPriv->w, dPriv->h);
++
++	if (rrb) {
++		cbpitch = (rrb->pitch / rrb->cpp);
++		if (rrb->cpp == 4)
++			cbpitch |= R300_COLOR_FORMAT_ARGB8888;
++		else
++			cbpitch |= R300_COLOR_FORMAT_RGB565;
++
++		if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE){
++			cbpitch |= R300_COLOR_TILE_ENABLE;
++        }
+ 	}
+ 
+-	cboffset += r300->radeon.radeonScreen->fbLocation;
+-
+-	cp_wait(r300, R300_WAIT_3D | R300_WAIT_3D_CLEAN);
+-	end_3d(rmesa);
+-
+-	R300_STATECHANGE(r300, cb);
+-	reg_start(R300_RB3D_COLOROFFSET0, 0);
+-	e32(cboffset);
+-
+-	if (r300->radeon.radeonScreen->cpp == 4)
+-		cbpitch |= R300_COLOR_FORMAT_ARGB8888;
+-	else
+-		cbpitch |= R300_COLOR_FORMAT_RGB565;
+-
+-	if (r300->radeon.sarea->tiling_enabled)
+-		cbpitch |= R300_COLOR_TILE_ENABLE;
+-
+-	reg_start(R300_RB3D_COLORPITCH0, 0);
+-	e32(cbpitch);
+-
+-	R300_STATECHANGE(r300, cmk);
+-	reg_start(RB3D_COLOR_CHANNEL_MASK, 0);
++	/* TODO in bufmgr */
++	cp_wait(&r300->radeon, R300_WAIT_3D | R300_WAIT_3D_CLEAN);
++	end_3d(&rmesa->radeon);
+ 
+ 	if (flags & CLEARBUFFER_COLOR) {
+-		e32((ctx->Color.ColorMask[BCOMP] ? RB3D_COLOR_CHANNEL_MASK_BLUE_MASK0 : 0) |
+-		    (ctx->Color.ColorMask[GCOMP] ? RB3D_COLOR_CHANNEL_MASK_GREEN_MASK0 : 0) |
+-		    (ctx->Color.ColorMask[RCOMP] ? RB3D_COLOR_CHANNEL_MASK_RED_MASK0 : 0) |
+-		    (ctx->Color.ColorMask[ACOMP] ? RB3D_COLOR_CHANNEL_MASK_ALPHA_MASK0 : 0));
++		assert(rrb != 0);
++		BEGIN_BATCH_NO_AUTOSTATE(6);
++		OUT_BATCH_REGSEQ(R300_RB3D_COLOROFFSET0, 1);
++		OUT_BATCH_RELOC(0, rrb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
++		OUT_BATCH_REGVAL(R300_RB3D_COLORPITCH0, cbpitch);
++		END_BATCH();
++	}
++#if 1
++	if (flags & (CLEARBUFFER_DEPTH | CLEARBUFFER_STENCIL)) {
++		assert(rrbd != 0);
++		cbpitch = (rrbd->pitch / rrbd->cpp);
++		if (rrbd->bo->flags & RADEON_BO_FLAGS_MACRO_TILE){
++			cbpitch |= R300_DEPTHMACROTILE_ENABLE;
++        }
++		if (rrbd->bo->flags & RADEON_BO_FLAGS_MICRO_TILE){
++            cbpitch |= R300_DEPTHMICROTILE_TILED;
++        }
++		BEGIN_BATCH_NO_AUTOSTATE(6);
++		OUT_BATCH_REGSEQ(R300_ZB_DEPTHOFFSET, 1);
++		OUT_BATCH_RELOC(0, rrbd->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
++		OUT_BATCH_REGVAL(R300_ZB_DEPTHPITCH, cbpitch);
++		END_BATCH();
++	}
++#endif
++	BEGIN_BATCH_NO_AUTOSTATE(6);
++	OUT_BATCH_REGSEQ(RB3D_COLOR_CHANNEL_MASK, 1);
++	if (flags & CLEARBUFFER_COLOR) {
++		OUT_BATCH((ctx->Color.ColorMask[BCOMP] ? RB3D_COLOR_CHANNEL_MASK_BLUE_MASK0 : 0) |
++			  (ctx->Color.ColorMask[GCOMP] ? RB3D_COLOR_CHANNEL_MASK_GREEN_MASK0 : 0) |
++			  (ctx->Color.ColorMask[RCOMP] ? RB3D_COLOR_CHANNEL_MASK_RED_MASK0 : 0) |
++			  (ctx->Color.ColorMask[ACOMP] ? RB3D_COLOR_CHANNEL_MASK_ALPHA_MASK0 : 0));
+ 	} else {
+-		e32(0x0);
++		OUT_BATCH(0);
+ 	}
+ 
+-	R300_STATECHANGE(r300, zs);
+-	reg_start(R300_ZB_CNTL, 2);
+ 
+ 	{
+ 		uint32_t t1, t2;
+@@ -146,37 +159,55 @@ static void r300ClearBuffer(r300ContextPtr r300, int flags, int buffer)
+ 			     R300_S_FRONT_ZFAIL_OP_SHIFT);
+ 		}
+ 
+-		e32(t1);
+-		e32(t2);
+-		e32(((ctx->Stencil.WriteMask[0] & R300_STENCILREF_MASK) << R300_STENCILWRITEMASK_SHIFT) |
+-		    (ctx->Stencil.Clear & R300_STENCILREF_MASK));
++		OUT_BATCH_REGSEQ(R300_ZB_CNTL, 3);
++		OUT_BATCH(t1);
++		OUT_BATCH(t2);
++		OUT_BATCH(((ctx->Stencil.WriteMask[0] & R300_STENCILREF_MASK) <<
++                   R300_STENCILWRITEMASK_SHIFT) |
++			  (ctx->Stencil.Clear & R300_STENCILREF_MASK));
++		END_BATCH();
+ 	}
+ 
+-	cmd2 = (drm_r300_cmd_header_t *) r300AllocCmdBuf(r300, 9, __FUNCTION__);
+-	cmd2[0].packet3.cmd_type = R300_CMD_PACKET3;
+-	cmd2[0].packet3.packet = R300_CMD_PACKET3_CLEAR;
+-	cmd2[1].u = r300PackFloat32(dPriv->w / 2.0);
+-	cmd2[2].u = r300PackFloat32(dPriv->h / 2.0);
+-	cmd2[3].u = r300PackFloat32(ctx->Depth.Clear);
+-	cmd2[4].u = r300PackFloat32(1.0);
+-	cmd2[5].u = r300PackFloat32(ctx->Color.ClearColor[0]);
+-	cmd2[6].u = r300PackFloat32(ctx->Color.ClearColor[1]);
+-	cmd2[7].u = r300PackFloat32(ctx->Color.ClearColor[2]);
+-	cmd2[8].u = r300PackFloat32(ctx->Color.ClearColor[3]);
+-
++	if (!rmesa->radeon.radeonScreen->kernel_mm) {
++		BEGIN_BATCH_NO_AUTOSTATE(9);
++		OUT_BATCH(cmdpacket3(r300->radeon.radeonScreen, R300_CMD_PACKET3_CLEAR));
++		OUT_BATCH_FLOAT32(dPriv->w / 2.0);
++		OUT_BATCH_FLOAT32(dPriv->h / 2.0);
++		OUT_BATCH_FLOAT32(ctx->Depth.Clear);
++		OUT_BATCH_FLOAT32(1.0);
++		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[0]);
++		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[1]);
++		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[2]);
++		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[3]);
++		END_BATCH();
++	} else {
++		OUT_BATCH(CP_PACKET3(R200_3D_DRAW_IMMD_2, 8));
++		OUT_BATCH(R300_PRIM_TYPE_POINT | R300_PRIM_WALK_RING |
++			  (1 << R300_PRIM_NUM_VERTICES_SHIFT));
++		OUT_BATCH_FLOAT32(dPriv->w / 2.0);
++		OUT_BATCH_FLOAT32(dPriv->h / 2.0);
++		OUT_BATCH_FLOAT32(ctx->Depth.Clear);
++		OUT_BATCH_FLOAT32(1.0);
++		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[0]);
++		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[1]);
++		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[2]);
++		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[3]);
++	}
++	
+ 	r300EmitCacheFlush(rmesa);
+-	cp_wait(rmesa, R300_WAIT_3D | R300_WAIT_3D_CLEAN);
++	cp_wait(&r300->radeon, R300_WAIT_3D | R300_WAIT_3D_CLEAN);
++
++	R300_STATECHANGE(r300, cb);
++	R300_STATECHANGE(r300, cmk);
++	R300_STATECHANGE(r300, zs);
+ }
+ 
+ static void r300EmitClearState(GLcontext * ctx)
+ {
+ 	r300ContextPtr r300 = R300_CONTEXT(ctx);
+-	r300ContextPtr rmesa = r300;
++	BATCH_LOCALS(&r300->radeon);
+ 	__DRIdrawablePrivate *dPriv = r300->radeon.dri.drawable;
+ 	int i;
+-	int cmd_reserved = 0;
+-	int cmd_written = 0;
+-	drm_radeon_cmd_header_t *cmd = NULL;
+ 	int has_tcl = 1;
+ 	int is_r500 = 0;
+ 	GLuint vap_cntl;
+@@ -184,35 +215,37 @@ static void r300EmitClearState(GLcontext * ctx)
+ 	if (!(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
+ 		has_tcl = 0;
+ 
+-        if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
+-                is_r500 = 1;
++	if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
++		is_r500 = 1;
+ 
+-
+-	/* FIXME: the values written to R300_VAP_INPUT_ROUTE_0_0 and
+-	 * R300_VAP_INPUT_ROUTE_0_1 are in fact known, however, the values are
+-	 * quite complex; see the functions in r300_emit.c.
++	/* State atom dirty tracking is a little subtle here.
++	 *
++	 * On the one hand, we need to make sure base state is emitted
++	 * here if we start with an empty batch buffer, otherwise clear
++	 * works incorrectly with multiple processes. Therefore, the first
++	 * BEGIN_BATCH cannot be a BEGIN_BATCH_NO_AUTOSTATE.
++	 *
++	 * On the other hand, implicit state emission clears the state atom
++	 * dirty bits, so we have to call R300_STATECHANGE later than the
++	 * first BEGIN_BATCH.
+ 	 *
+-	 * I believe it would be a good idea to extend the functions in
+-	 * r300_emit.c so that they can be used to setup the default values for
+-	 * these registers, as well as the actual values used for rendering.
++	 * The final trickiness is that, because we change state, we need
++	 * to ensure that any stored swtcl primitives are flushed properly
++	 * before we start changing state. See the R300_NEWPRIM in r300Clear
++	 * for this.
+ 	 */
+-	R300_STATECHANGE(r300, vir[0]);
+-	reg_start(R300_VAP_PROG_STREAM_CNTL_0, 0);
++	BEGIN_BATCH(31);
++	OUT_BATCH_REGSEQ(R300_VAP_PROG_STREAM_CNTL_0, 1);
+ 	if (!has_tcl)
+-	    e32(((((0 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_0_SHIFT) |
++		OUT_BATCH(((((0 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_0_SHIFT) |
+ 		 ((R300_LAST_VEC | (2 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_1_SHIFT)));
+ 	else
+-	    e32(((((0 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_0_SHIFT) |
++		OUT_BATCH(((((0 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_0_SHIFT) |
+ 		 ((R300_LAST_VEC | (1 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_1_SHIFT)));
+ 
+-	/* disable fog */
+-	R300_STATECHANGE(r300, fogs);
+-	reg_start(R300_FG_FOG_BLEND, 0);
+-	e32(0x0);
+-
+-	R300_STATECHANGE(r300, vir[1]);
+-	reg_start(R300_VAP_PROG_STREAM_CNTL_EXT_0, 0);
+-	e32(((((R300_SWIZZLE_SELECT_X << R300_SWIZZLE_SELECT_X_SHIFT) |
++	OUT_BATCH_REGVAL(R300_FG_FOG_BLEND, 0);
++	OUT_BATCH_REGVAL(R300_VAP_PROG_STREAM_CNTL_EXT_0,
++	   ((((R300_SWIZZLE_SELECT_X << R300_SWIZZLE_SELECT_X_SHIFT) |
+ 	       (R300_SWIZZLE_SELECT_Y << R300_SWIZZLE_SELECT_Y_SHIFT) |
+ 	       (R300_SWIZZLE_SELECT_Z << R300_SWIZZLE_SELECT_Z_SHIFT) |
+ 	       (R300_SWIZZLE_SELECT_W << R300_SWIZZLE_SELECT_W_SHIFT) |
+@@ -226,238 +259,278 @@ static void r300EmitClearState(GLcontext * ctx)
+ 	      << R300_SWIZZLE1_SHIFT)));
+ 
+ 	/* R300_VAP_INPUT_CNTL_0, R300_VAP_INPUT_CNTL_1 */
+-	R300_STATECHANGE(r300, vic);
+-	reg_start(R300_VAP_VTX_STATE_CNTL, 1);
+-	e32((R300_SEL_USER_COLOR_0 << R300_COLOR_0_ASSEMBLY_SHIFT));
+-	e32(R300_INPUT_CNTL_POS | R300_INPUT_CNTL_COLOR | R300_INPUT_CNTL_TC0);
++	OUT_BATCH_REGSEQ(R300_VAP_VTX_STATE_CNTL, 2);
++	OUT_BATCH((R300_SEL_USER_COLOR_0 << R300_COLOR_0_ASSEMBLY_SHIFT));
++	OUT_BATCH(R300_INPUT_CNTL_POS | R300_INPUT_CNTL_COLOR | R300_INPUT_CNTL_TC0);
+ 
+-	R300_STATECHANGE(r300, vte);
+ 	/* comes from fglrx startup of clear */
+-	reg_start(R300_SE_VTE_CNTL, 1);
+-	e32(R300_VTX_W0_FMT | R300_VPORT_X_SCALE_ENA |
+-	    R300_VPORT_X_OFFSET_ENA | R300_VPORT_Y_SCALE_ENA |
+-	    R300_VPORT_Y_OFFSET_ENA | R300_VPORT_Z_SCALE_ENA |
+-	    R300_VPORT_Z_OFFSET_ENA);
+-	e32(0x8);
++	OUT_BATCH_REGSEQ(R300_SE_VTE_CNTL, 2);
++	OUT_BATCH(R300_VTX_W0_FMT | R300_VPORT_X_SCALE_ENA |
++		  R300_VPORT_X_OFFSET_ENA | R300_VPORT_Y_SCALE_ENA |
++		  R300_VPORT_Y_OFFSET_ENA | R300_VPORT_Z_SCALE_ENA |
++		  R300_VPORT_Z_OFFSET_ENA);
++	OUT_BATCH(0x8);
+ 
+-	reg_start(R300_VAP_PSC_SGN_NORM_CNTL, 0);
+-	e32(0xaaaaaaaa);
++	OUT_BATCH_REGVAL(R300_VAP_PSC_SGN_NORM_CNTL, 0xaaaaaaaa);
+ 
+-	R300_STATECHANGE(r300, vof);
+-	reg_start(R300_VAP_OUTPUT_VTX_FMT_0, 1);
+-	e32(R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT |
+-	    R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT);
+-	e32(0x0);		/* no textures */
++	OUT_BATCH_REGSEQ(R300_VAP_OUTPUT_VTX_FMT_0, 2);
++	OUT_BATCH(R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT |
++		  R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT);
++	OUT_BATCH(0); /* no textures */
+ 
+-	R300_STATECHANGE(r300, txe);
+-	reg_start(R300_TX_ENABLE, 0);
+-	e32(0x0);
++	OUT_BATCH_REGVAL(R300_TX_ENABLE, 0);
+ 
+-	R300_STATECHANGE(r300, vpt);
+-	reg_start(R300_SE_VPORT_XSCALE, 5);
+-	efloat(1.0);
+-	efloat(dPriv->x);
+-	efloat(1.0);
+-	efloat(dPriv->y);
+-	efloat(1.0);
+-	efloat(0.0);
++	OUT_BATCH_REGSEQ(R300_SE_VPORT_XSCALE, 6);
++	OUT_BATCH_FLOAT32(1.0);
++	OUT_BATCH_FLOAT32(dPriv->x);
++	OUT_BATCH_FLOAT32(1.0);
++	OUT_BATCH_FLOAT32(dPriv->y);
++	OUT_BATCH_FLOAT32(1.0);
++	OUT_BATCH_FLOAT32(0.0);
+ 
+-	R300_STATECHANGE(r300, at);
+-	reg_start(R300_FG_ALPHA_FUNC, 0);
+-	e32(0x0);
++	OUT_BATCH_REGVAL(R300_FG_ALPHA_FUNC, 0);
++
++	OUT_BATCH_REGSEQ(R300_RB3D_CBLEND, 2);
++	OUT_BATCH(0x0);
++	OUT_BATCH(0x0);
++	END_BATCH();
+ 
++	R300_STATECHANGE(r300, vir[0]);
++	R300_STATECHANGE(r300, fogs);
++	R300_STATECHANGE(r300, vir[1]);
++	R300_STATECHANGE(r300, vic);
++	R300_STATECHANGE(r300, vte);
++	R300_STATECHANGE(r300, vof);
++	R300_STATECHANGE(r300, txe);
++	R300_STATECHANGE(r300, vpt);
++	R300_STATECHANGE(r300, at);
+ 	R300_STATECHANGE(r300, bld);
+-	reg_start(R300_RB3D_CBLEND, 1);
+-	e32(0x0);
+-	e32(0x0);
++	R300_STATECHANGE(r300, ps);
+ 
+ 	if (has_tcl) {
+-	    R300_STATECHANGE(r300, vap_clip_cntl);
+-	    reg_start(R300_VAP_CLIP_CNTL, 0);
+-	    e32(R300_PS_UCP_MODE_CLIP_AS_TRIFAN | R300_CLIP_DISABLE);
++		R300_STATECHANGE(r300, vap_clip_cntl);
++
++		BEGIN_BATCH_NO_AUTOSTATE(2);
++		OUT_BATCH_REGVAL(R300_VAP_CLIP_CNTL, R300_PS_UCP_MODE_CLIP_AS_TRIFAN | R300_CLIP_DISABLE);
++		END_BATCH();
+         }
+ 
+-	R300_STATECHANGE(r300, ps);
+-	reg_start(R300_GA_POINT_SIZE, 0);
+-	e32(((dPriv->w * 6) << R300_POINTSIZE_X_SHIFT) |
+-	    ((dPriv->h * 6) << R300_POINTSIZE_Y_SHIFT));
++	BEGIN_BATCH_NO_AUTOSTATE(2);
++	OUT_BATCH_REGVAL(R300_GA_POINT_SIZE,
++		((dPriv->w * 6) << R300_POINTSIZE_X_SHIFT) |
++		((dPriv->h * 6) << R300_POINTSIZE_Y_SHIFT));
++	END_BATCH();
+ 
+ 	if (!is_r500) {
+ 		R300_STATECHANGE(r300, ri);
+-		reg_start(R300_RS_IP_0, 7);
+-		for (i = 0; i < 8; ++i) {
+-			e32(R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(3));
+-		}
+-
+ 		R300_STATECHANGE(r300, rc);
+-		/* The second constant is needed to get glxgears display anything .. */
+-		reg_start(R300_RS_COUNT, 1);
+-		e32((1 << R300_IC_COUNT_SHIFT) | R300_HIRES_EN);
+-		e32(0x0);
+-
+ 		R300_STATECHANGE(r300, rr);
+-		reg_start(R300_RS_INST_0, 0);
+-		e32(R300_RS_INST_COL_CN_WRITE);
++
++		BEGIN_BATCH(14);
++		OUT_BATCH_REGSEQ(R300_RS_IP_0, 8);
++		for (i = 0; i < 8; ++i)
++			OUT_BATCH(R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(3));
++
++		OUT_BATCH_REGSEQ(R300_RS_COUNT, 2);
++		OUT_BATCH((1 << R300_IC_COUNT_SHIFT) | R300_HIRES_EN);
++		OUT_BATCH(0x0);
++
++		OUT_BATCH_REGVAL(R300_RS_INST_0, R300_RS_INST_COL_CN_WRITE);
++		END_BATCH();
+ 	} else {
+ 		R300_STATECHANGE(r300, ri);
+-		reg_start(R500_RS_IP_0, 7);
++		R300_STATECHANGE(r300, rc);
++		R300_STATECHANGE(r300, rr);
++
++		BEGIN_BATCH(14);
++		OUT_BATCH_REGSEQ(R500_RS_IP_0, 8);
+ 		for (i = 0; i < 8; ++i) {
+-			e32((R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_S_SHIFT) |
+-			    (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT) |
+-			    (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT) |
+-			    (R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT));
++			OUT_BATCH((R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_S_SHIFT) |
++				  (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT) |
++				  (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT) |
++				  (R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT));
+ 		}
+ 
+-		R300_STATECHANGE(r300, rc);
+-		/* The second constant is needed to get glxgears display anything .. */
+-		reg_start(R300_RS_COUNT, 1);
+-		e32((1 << R300_IC_COUNT_SHIFT) | R300_HIRES_EN);
+-		e32(0x0);
+-
+-		R300_STATECHANGE(r300, rr);
+-		reg_start(R500_RS_INST_0, 0);
+-		e32(R500_RS_INST_COL_CN_WRITE);
++		OUT_BATCH_REGSEQ(R300_RS_COUNT, 2);
++		OUT_BATCH((1 << R300_IC_COUNT_SHIFT) | R300_HIRES_EN);
++		OUT_BATCH(0x0);
+ 
++		OUT_BATCH_REGVAL(R500_RS_INST_0, R500_RS_INST_COL_CN_WRITE);
++		END_BATCH();
+ 	}
+ 
+ 	if (!is_r500) {
+ 		R300_STATECHANGE(r300, fp);
+-		reg_start(R300_US_CONFIG, 2);
+-		e32(0x0);
+-		e32(0x0);
+-		e32(0x0);
+-		reg_start(R300_US_CODE_ADDR_0, 3);
+-		e32(0x0);
+-		e32(0x0);
+-		e32(0x0);
+-		e32(R300_RGBA_OUT);
+-
+ 		R300_STATECHANGE(r300, fpi[0]);
+ 		R300_STATECHANGE(r300, fpi[1]);
+ 		R300_STATECHANGE(r300, fpi[2]);
+ 		R300_STATECHANGE(r300, fpi[3]);
+ 
+-		reg_start(R300_US_ALU_RGB_INST_0, 0);
+-		e32(FP_INSTRC(MAD, FP_ARGC(SRC0C_XYZ), FP_ARGC(ONE), FP_ARGC(ZERO)));
+-
+-		reg_start(R300_US_ALU_RGB_ADDR_0, 0);
+-		e32(FP_SELC(0, NO, XYZ, FP_TMP(0), 0, 0));
+-
+-		reg_start(R300_US_ALU_ALPHA_INST_0, 0);
+-		e32(FP_INSTRA(MAD, FP_ARGA(SRC0A), FP_ARGA(ONE), FP_ARGA(ZERO)));
+-
+-		reg_start(R300_US_ALU_ALPHA_ADDR_0, 0);
+-		e32(FP_SELA(0, NO, W, FP_TMP(0), 0, 0));
++		BEGIN_BATCH(17);
++		OUT_BATCH_REGSEQ(R300_US_CONFIG, 3);
++		OUT_BATCH(0x0);
++		OUT_BATCH(0x0);
++		OUT_BATCH(0x0);
++		OUT_BATCH_REGSEQ(R300_US_CODE_ADDR_0, 4);
++		OUT_BATCH(0x0);
++		OUT_BATCH(0x0);
++		OUT_BATCH(0x0);
++		OUT_BATCH(R300_RGBA_OUT);
++
++		OUT_BATCH_REGVAL(R300_US_ALU_RGB_INST_0,
++			FP_INSTRC(MAD, FP_ARGC(SRC0C_XYZ), FP_ARGC(ONE), FP_ARGC(ZERO)));
++		OUT_BATCH_REGVAL(R300_US_ALU_RGB_ADDR_0,
++			FP_SELC(0, NO, XYZ, FP_TMP(0), 0, 0));
++		OUT_BATCH_REGVAL(R300_US_ALU_ALPHA_INST_0,
++			FP_INSTRA(MAD, FP_ARGA(SRC0A), FP_ARGA(ONE), FP_ARGA(ZERO)));
++		OUT_BATCH_REGVAL(R300_US_ALU_ALPHA_ADDR_0,
++			FP_SELA(0, NO, W, FP_TMP(0), 0, 0));
++		END_BATCH();
+ 	} else {
+- 		R300_STATECHANGE(r300, fp);
+- 		reg_start(R500_US_CONFIG, 1);
+- 		e32(R500_ZERO_TIMES_ANYTHING_EQUALS_ZERO);
+- 		e32(0x0);
+- 		reg_start(R500_US_CODE_ADDR, 2);
+- 		e32(R500_US_CODE_START_ADDR(0) | R500_US_CODE_END_ADDR(1));
+- 		e32(R500_US_CODE_RANGE_ADDR(0) | R500_US_CODE_RANGE_SIZE(1));
+- 		e32(R500_US_CODE_OFFSET_ADDR(0));
++		struct radeon_state_atom r500fp;
++		uint32_t _cmd[10];
+ 
++		R300_STATECHANGE(r300, fp);
+ 		R300_STATECHANGE(r300, r500fp);
+-		r500fp_start_fragment(0, 6);
+-
+-		e32(R500_INST_TYPE_OUT |
+-		    R500_INST_TEX_SEM_WAIT |
+-		    R500_INST_LAST |
+-		    R500_INST_RGB_OMASK_R |
+-		    R500_INST_RGB_OMASK_G |
+-		    R500_INST_RGB_OMASK_B |
+-		    R500_INST_ALPHA_OMASK |
+-		    R500_INST_RGB_CLAMP |
+-		    R500_INST_ALPHA_CLAMP);
+-
+-		e32(R500_RGB_ADDR0(0) |
+-		    R500_RGB_ADDR1(0) |
+-		    R500_RGB_ADDR1_CONST |
+-		    R500_RGB_ADDR2(0) |
+-		    R500_RGB_ADDR2_CONST);
+-
+-		e32(R500_ALPHA_ADDR0(0) |
+-		    R500_ALPHA_ADDR1(0) |
+-		    R500_ALPHA_ADDR1_CONST |
+-		    R500_ALPHA_ADDR2(0) |
+-		    R500_ALPHA_ADDR2_CONST);
+-
+-		e32(R500_ALU_RGB_SEL_A_SRC0 |
+-		    R500_ALU_RGB_R_SWIZ_A_R |
+-		    R500_ALU_RGB_G_SWIZ_A_G |
+-		    R500_ALU_RGB_B_SWIZ_A_B |
+-		    R500_ALU_RGB_SEL_B_SRC0 |
+-		    R500_ALU_RGB_R_SWIZ_B_R |
+-		    R500_ALU_RGB_B_SWIZ_B_G |
+-		    R500_ALU_RGB_G_SWIZ_B_B);
+-
+-		e32(R500_ALPHA_OP_CMP |
+-		    R500_ALPHA_SWIZ_A_A |
+-		    R500_ALPHA_SWIZ_B_A);
+-
+-		e32(R500_ALU_RGBA_OP_CMP |
+-		    R500_ALU_RGBA_R_SWIZ_0 |
+-		    R500_ALU_RGBA_G_SWIZ_0 |
+-		    R500_ALU_RGBA_B_SWIZ_0 |
+-		    R500_ALU_RGBA_A_SWIZ_0);
++
++		BEGIN_BATCH(7);
++		OUT_BATCH_REGSEQ(R500_US_CONFIG, 2);
++		OUT_BATCH(R500_ZERO_TIMES_ANYTHING_EQUALS_ZERO);
++		OUT_BATCH(0x0);
++		OUT_BATCH_REGSEQ(R500_US_CODE_ADDR, 3);
++		OUT_BATCH(R500_US_CODE_START_ADDR(0) | R500_US_CODE_END_ADDR(1));
++		OUT_BATCH(R500_US_CODE_RANGE_ADDR(0) | R500_US_CODE_RANGE_SIZE(1));
++		OUT_BATCH(R500_US_CODE_OFFSET_ADDR(0));
++		END_BATCH();
++
++		r500fp.check = check_r500fp;
++		r500fp.cmd = _cmd;
++		r500fp.cmd[0] = cmdr500fp(r300->radeon.radeonScreen, 0, 1, 0, 0);
++		r500fp.cmd[1] = R500_INST_TYPE_OUT |
++			R500_INST_TEX_SEM_WAIT |
++			R500_INST_LAST |
++			R500_INST_RGB_OMASK_R |
++			R500_INST_RGB_OMASK_G |
++			R500_INST_RGB_OMASK_B |
++			R500_INST_ALPHA_OMASK |
++			R500_INST_RGB_CLAMP |
++			R500_INST_ALPHA_CLAMP;
++		r500fp.cmd[2] = R500_RGB_ADDR0(0) |
++			R500_RGB_ADDR1(0) |
++			R500_RGB_ADDR1_CONST |
++			R500_RGB_ADDR2(0) |
++			R500_RGB_ADDR2_CONST;
++		r500fp.cmd[3] = R500_ALPHA_ADDR0(0) |
++			R500_ALPHA_ADDR1(0) |
++			R500_ALPHA_ADDR1_CONST |
++			R500_ALPHA_ADDR2(0) |
++			R500_ALPHA_ADDR2_CONST;
++		r500fp.cmd[4] = R500_ALU_RGB_SEL_A_SRC0 |
++			R500_ALU_RGB_R_SWIZ_A_R |
++			R500_ALU_RGB_G_SWIZ_A_G |
++			R500_ALU_RGB_B_SWIZ_A_B |
++			R500_ALU_RGB_SEL_B_SRC0 |
++			R500_ALU_RGB_R_SWIZ_B_R |
++			R500_ALU_RGB_B_SWIZ_B_G |
++			R500_ALU_RGB_G_SWIZ_B_B;
++		r500fp.cmd[5] = R500_ALPHA_OP_CMP |
++			R500_ALPHA_SWIZ_A_A |
++			R500_ALPHA_SWIZ_B_A;
++		r500fp.cmd[6] = R500_ALU_RGBA_OP_CMP |
++			R500_ALU_RGBA_R_SWIZ_0 |
++			R500_ALU_RGBA_G_SWIZ_0 |
++			R500_ALU_RGBA_B_SWIZ_0 |
++			R500_ALU_RGBA_A_SWIZ_0;
++		
++		r500fp.cmd[7] = 0;
++		emit_r500fp(ctx, &r500fp);
+ 	}
+ 
+-	reg_start(R300_VAP_PVS_STATE_FLUSH_REG, 0);
+-	e32(0x00000000);
++	BEGIN_BATCH(2);
++	OUT_BATCH_REGVAL(R300_VAP_PVS_STATE_FLUSH_REG, 0);
++	END_BATCH();
++
+ 	if (has_tcl) {
+-	    vap_cntl = ((10 << R300_PVS_NUM_SLOTS_SHIFT) |
++		vap_cntl = ((10 << R300_PVS_NUM_SLOTS_SHIFT) |
+ 			(5 << R300_PVS_NUM_CNTLRS_SHIFT) |
+ 			(12 << R300_VF_MAX_VTX_NUM_SHIFT));
+-	    if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
+-		vap_cntl |= R500_TCL_STATE_OPTIMIZATION;
+-	} else
+-	    vap_cntl = ((10 << R300_PVS_NUM_SLOTS_SHIFT) |
++		if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
++			vap_cntl |= R500_TCL_STATE_OPTIMIZATION;
++	} else {
++		vap_cntl = ((10 << R300_PVS_NUM_SLOTS_SHIFT) |
+ 			(5 << R300_PVS_NUM_CNTLRS_SHIFT) |
+ 			(5 << R300_VF_MAX_VTX_NUM_SHIFT));
++	}
+ 
+ 	if (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV515)
+-	    vap_cntl |= (2 << R300_PVS_NUM_FPUS_SHIFT);
++		vap_cntl |= (2 << R300_PVS_NUM_FPUS_SHIFT);
+ 	else if ((r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV530) ||
+ 		 (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV560) ||
+ 		 (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV570))
+-	    vap_cntl |= (5 << R300_PVS_NUM_FPUS_SHIFT);
++		vap_cntl |= (5 << R300_PVS_NUM_FPUS_SHIFT);
+ 	else if ((r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV410) ||
+ 		 (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_R420))
+-	    vap_cntl |= (6 << R300_PVS_NUM_FPUS_SHIFT);
++		vap_cntl |= (6 << R300_PVS_NUM_FPUS_SHIFT);
+ 	else if ((r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_R520) ||
+ 		 (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_R580))
+-	    vap_cntl |= (8 << R300_PVS_NUM_FPUS_SHIFT);
++		vap_cntl |= (8 << R300_PVS_NUM_FPUS_SHIFT);
+ 	else
+-	    vap_cntl |= (4 << R300_PVS_NUM_FPUS_SHIFT);
++		vap_cntl |= (4 << R300_PVS_NUM_FPUS_SHIFT);
++
++	R300_STATECHANGE(r300, vap_cntl);
+ 
+-	R300_STATECHANGE(rmesa, vap_cntl);
+-	reg_start(R300_VAP_CNTL, 0);
+-	e32(vap_cntl);
++	BEGIN_BATCH(2);
++	OUT_BATCH_REGVAL(R300_VAP_CNTL, vap_cntl);
++	END_BATCH();
+ 
+ 	if (has_tcl) {
++        struct radeon_state_atom vpu;
++        uint32_t _cmd[10];
+ 		R300_STATECHANGE(r300, pvs);
+-		reg_start(R300_VAP_PVS_CODE_CNTL_0, 2);
+-
+-		e32((0 << R300_PVS_FIRST_INST_SHIFT) |
+-		    (0 << R300_PVS_XYZW_VALID_INST_SHIFT) |
+-		    (1 << R300_PVS_LAST_INST_SHIFT));
+-		e32((0 << R300_PVS_CONST_BASE_OFFSET_SHIFT) |
+-		    (0 << R300_PVS_MAX_CONST_ADDR_SHIFT));
+-		e32(1 << R300_PVS_LAST_VTX_SRC_INST_SHIFT);
+-
+ 		R300_STATECHANGE(r300, vpi);
+-		vsf_start_fragment(0x0, 8);
+-
+-		e32(PVS_OP_DST_OPERAND(VE_ADD, GL_FALSE, GL_FALSE, 0, 0xf, PVS_DST_REG_OUT));
+-		e32(PVS_SRC_OPERAND(0, PVS_SRC_SELECT_X, PVS_SRC_SELECT_Y, PVS_SRC_SELECT_Z, PVS_SRC_SELECT_W, PVS_SRC_REG_INPUT, VSF_FLAG_NONE));
+-		e32(PVS_SRC_OPERAND(0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_REG_INPUT, VSF_FLAG_NONE));
+-		e32(0x0);
+ 
+-		e32(PVS_OP_DST_OPERAND(VE_ADD, GL_FALSE, GL_FALSE, 1, 0xf, PVS_DST_REG_OUT));
+-		e32(PVS_SRC_OPERAND(1, PVS_SRC_SELECT_X, PVS_SRC_SELECT_Y, PVS_SRC_SELECT_Z, PVS_SRC_SELECT_W, PVS_SRC_REG_INPUT, VSF_FLAG_NONE));
+-		e32(PVS_SRC_OPERAND(1, PVS_SRC_SELECT_FORCE_0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_REG_INPUT, VSF_FLAG_NONE));
+-		e32(0x0);
++		BEGIN_BATCH(4);
++		OUT_BATCH_REGSEQ(R300_VAP_PVS_CODE_CNTL_0, 3);
++		OUT_BATCH((0 << R300_PVS_FIRST_INST_SHIFT) |
++			  (0 << R300_PVS_XYZW_VALID_INST_SHIFT) |
++			  (1 << R300_PVS_LAST_INST_SHIFT));
++		OUT_BATCH((0 << R300_PVS_CONST_BASE_OFFSET_SHIFT) |
++			  (0 << R300_PVS_MAX_CONST_ADDR_SHIFT));
++		OUT_BATCH(1 << R300_PVS_LAST_VTX_SRC_INST_SHIFT);
++		END_BATCH();
++
++		vpu.check = check_vpu;
++		vpu.cmd = _cmd;
++		vpu.cmd[0] = cmdvpu(r300->radeon.radeonScreen, 0, 2);
++
++		vpu.cmd[1] = PVS_OP_DST_OPERAND(VE_ADD, GL_FALSE, GL_FALSE,
++                                         0, 0xf, PVS_DST_REG_OUT);
++		vpu.cmd[2] = PVS_SRC_OPERAND(0, PVS_SRC_SELECT_X, PVS_SRC_SELECT_Y,
++                                      PVS_SRC_SELECT_Z, PVS_SRC_SELECT_W,
++                                      PVS_SRC_REG_INPUT, VSF_FLAG_NONE);
++		vpu.cmd[3] = PVS_SRC_OPERAND(0, PVS_SRC_SELECT_FORCE_0,
++                                      PVS_SRC_SELECT_FORCE_0,
++                                      PVS_SRC_SELECT_FORCE_0,
++                                      PVS_SRC_SELECT_FORCE_0,
++                                      PVS_SRC_REG_INPUT, VSF_FLAG_NONE);
++		vpu.cmd[4] = 0x0;
++
++		vpu.cmd[5] = PVS_OP_DST_OPERAND(VE_ADD, GL_FALSE, GL_FALSE, 1, 0xf,
++                                         PVS_DST_REG_OUT);
++		vpu.cmd[6] = PVS_SRC_OPERAND(1, PVS_SRC_SELECT_X,
++                                      PVS_SRC_SELECT_Y, PVS_SRC_SELECT_Z,
++                                      PVS_SRC_SELECT_W, PVS_SRC_REG_INPUT,
++
++                                      VSF_FLAG_NONE);
++		vpu.cmd[7] = PVS_SRC_OPERAND(1, PVS_SRC_SELECT_FORCE_0,
++                                      PVS_SRC_SELECT_FORCE_0,
++                                      PVS_SRC_SELECT_FORCE_0,
++                                      PVS_SRC_SELECT_FORCE_0,
++                                      PVS_SRC_REG_INPUT, VSF_FLAG_NONE);
++		vpu.cmd[8] = 0x0;
++
++		r300->vap_flush_needed = GL_TRUE;
++		emit_vpu(ctx, &vpu);
+ 	}
+ }
+ 
+@@ -468,9 +541,11 @@ static void r300Clear(GLcontext * ctx, GLbitfield mask)
+ {
+ 	r300ContextPtr r300 = R300_CONTEXT(ctx);
+ 	__DRIdrawablePrivate *dPriv = r300->radeon.dri.drawable;
++	GLframebuffer *fb = dPriv->driverPrivate;
++	struct radeon_renderbuffer *rrb;
++	struct radeon_renderbuffer *rrbd;
+ 	int flags = 0;
+ 	int bits = 0;
+-	int swapped;
+ 
+ 	if (RADEON_DEBUG & DEBUG_IOCTL)
+ 		fprintf(stderr, "r300Clear\n");
+@@ -482,6 +557,12 @@ static void r300Clear(GLcontext * ctx, GLbitfield mask)
+ 			return;
+ 	}
+ 
++	/* Flush swtcl vertices if necessary, because we will change hardware
++	 * state during clear. See also the state-related comment in
++	 * r300EmitClearState.
++	 */
++	R300_NEWPRIM(r300);
++
+ 	if (mask & BUFFER_BIT_FRONT_LEFT) {
+ 		flags |= BUFFER_BIT_FRONT_LEFT;
+ 		mask &= ~BUFFER_BIT_FRONT_LEFT;
+@@ -497,7 +578,7 @@ static void r300Clear(GLcontext * ctx, GLbitfield mask)
+ 		mask &= ~BUFFER_BIT_DEPTH;
+ 	}
+ 
+-	if ((mask & BUFFER_BIT_STENCIL) && r300->state.stencil.hw_stencil) {
++	if ((mask & BUFFER_BIT_STENCIL) && r300->radeon.state.stencil.hwBuffer) {
+ 		bits |= CLEARBUFFER_STENCIL;
+ 		mask &= ~BUFFER_BIT_STENCIL;
+ 	}
+@@ -509,336 +590,33 @@ static void r300Clear(GLcontext * ctx, GLbitfield mask)
+ 		_swrast_Clear(ctx, mask);
+ 	}
+ 
+-	swapped = r300->radeon.sarea->pfCurrentPage == 1;
+-
+ 	/* Make sure it fits there. */
+-	r300EnsureCmdBufSpace(r300, 421 * 3, __FUNCTION__);
++	rcommonEnsureCmdBufSpace(&r300->radeon, 421 * 3, __FUNCTION__);
+ 	if (flags || bits)
+ 		r300EmitClearState(ctx);
++	rrbd = (void *)fb->Attachment[BUFFER_DEPTH].Renderbuffer;
+ 
+ 	if (flags & BUFFER_BIT_FRONT_LEFT) {
+-		r300ClearBuffer(r300, bits | CLEARBUFFER_COLOR, swapped);
++		rrb = (void *)fb->Attachment[BUFFER_FRONT_LEFT].Renderbuffer;
++		r300ClearBuffer(r300, bits | CLEARBUFFER_COLOR, rrb, rrbd);
+ 		bits = 0;
+ 	}
+ 
+ 	if (flags & BUFFER_BIT_BACK_LEFT) {
+-		r300ClearBuffer(r300, bits | CLEARBUFFER_COLOR, swapped ^ 1);
++		rrb = (void *)fb->Attachment[BUFFER_BACK_LEFT].Renderbuffer;
++		r300ClearBuffer(r300, bits | CLEARBUFFER_COLOR, rrb, rrbd);
+ 		bits = 0;
+ 	}
+ 
+ 	if (bits)
+-		r300ClearBuffer(r300, bits, 0);
++		r300ClearBuffer(r300, bits, NULL, rrbd);
+ 
+-}
+-
+-void r300Flush(GLcontext * ctx)
+-{
+-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+-
+-	if (RADEON_DEBUG & DEBUG_IOCTL)
+-		fprintf(stderr, "%s\n", __FUNCTION__);
+-
+-	if (rmesa->dma.flush)
+-		rmesa->dma.flush( rmesa );
+-
+-	if (rmesa->cmdbuf.count_used > rmesa->cmdbuf.count_reemit)
+-		r300FlushCmdBuf(rmesa, __FUNCTION__);
+-}
+-
+-#ifdef USER_BUFFERS
+-#include "r300_mem.h"
+-
+-void r300RefillCurrentDmaRegion(r300ContextPtr rmesa, int size)
+-{
+-	struct r300_dma_buffer *dmabuf;
+-	size = MAX2(size, RADEON_BUFFER_SIZE * 16);
+-
+-	if (RADEON_DEBUG & (DEBUG_IOCTL | DEBUG_DMA))
+-		fprintf(stderr, "%s\n", __FUNCTION__);
+-
+-	if (rmesa->dma.flush) {
+-		rmesa->dma.flush(rmesa);
+-	}
+-
+-	if (rmesa->dma.current.buf) {
+-#ifdef USER_BUFFERS
+-		r300_mem_use(rmesa, rmesa->dma.current.buf->id);
+-#endif
+-		r300ReleaseDmaRegion(rmesa, &rmesa->dma.current, __FUNCTION__);
+-	}
+-	if (rmesa->dma.nr_released_bufs > 4)
+-		r300FlushCmdBuf(rmesa, __FUNCTION__);
+-
+-	dmabuf = CALLOC_STRUCT(r300_dma_buffer);
+-	dmabuf->buf = (void *)1;	/* hack */
+-	dmabuf->refcount = 1;
+-
+-	dmabuf->id = r300_mem_alloc(rmesa, 4, size);
+-	if (dmabuf->id == 0) {
+-		LOCK_HARDWARE(&rmesa->radeon);	/* no need to validate */
+-
+-		r300FlushCmdBufLocked(rmesa, __FUNCTION__);
+-		radeonWaitForIdleLocked(&rmesa->radeon);
+-
+-		dmabuf->id = r300_mem_alloc(rmesa, 4, size);
+-
+-		UNLOCK_HARDWARE(&rmesa->radeon);
+-
+-		if (dmabuf->id == 0) {
+-			fprintf(stderr,
+-				"Error: Could not get dma buffer... exiting\n");
+-			_mesa_exit(-1);
+-		}
+-	}
+-
+-	rmesa->dma.current.buf = dmabuf;
+-	rmesa->dma.current.address = r300_mem_ptr(rmesa, dmabuf->id);
+-	rmesa->dma.current.end = size;
+-	rmesa->dma.current.start = 0;
+-	rmesa->dma.current.ptr = 0;
+-}
+-
+-void r300ReleaseDmaRegion(r300ContextPtr rmesa,
+-			  struct r300_dma_region *region, const char *caller)
+-{
+-	if (RADEON_DEBUG & DEBUG_IOCTL)
+-		fprintf(stderr, "%s from %s\n", __FUNCTION__, caller);
+-
+-	if (!region->buf)
+-		return;
+-
+-	if (rmesa->dma.flush)
+-		rmesa->dma.flush(rmesa);
+-
+-	if (--region->buf->refcount == 0) {
+-		r300_mem_free(rmesa, region->buf->id);
+-		FREE(region->buf);
+-		rmesa->dma.nr_released_bufs++;
+-	}
+-
+-	region->buf = 0;
+-	region->start = 0;
+-}
+-
+-/* Allocates a region from rmesa->dma.current.  If there isn't enough
+- * space in current, grab a new buffer (and discard what was left of current)
+- */
+-void r300AllocDmaRegion(r300ContextPtr rmesa,
+-			struct r300_dma_region *region,
+-			int bytes, int alignment)
+-{
+-	if (RADEON_DEBUG & DEBUG_IOCTL)
+-		fprintf(stderr, "%s %d\n", __FUNCTION__, bytes);
+-
+-	if (rmesa->dma.flush)
+-		rmesa->dma.flush(rmesa);
+-
+-	if (region->buf)
+-		r300ReleaseDmaRegion(rmesa, region, __FUNCTION__);
+-
+-	alignment--;
+-	rmesa->dma.current.start = rmesa->dma.current.ptr =
+-	    (rmesa->dma.current.ptr + alignment) & ~alignment;
+-
+-	if (rmesa->dma.current.ptr + bytes > rmesa->dma.current.end)
+-		r300RefillCurrentDmaRegion(rmesa, (bytes + 0x7) & ~0x7);
+-
+-	region->start = rmesa->dma.current.start;
+-	region->ptr = rmesa->dma.current.start;
+-	region->end = rmesa->dma.current.start + bytes;
+-	region->address = rmesa->dma.current.address;
+-	region->buf = rmesa->dma.current.buf;
+-	region->buf->refcount++;
+-
+-	rmesa->dma.current.ptr += bytes;	/* bug - if alignment > 7 */
+-	rmesa->dma.current.start =
+-	    rmesa->dma.current.ptr = (rmesa->dma.current.ptr + 0x7) & ~0x7;
+-
+-	assert(rmesa->dma.current.ptr <= rmesa->dma.current.end);
+-}
+-
+-#else
+-static void r300RefillCurrentDmaRegion(r300ContextPtr rmesa)
+-{
+-	struct r300_dma_buffer *dmabuf;
+-	int fd = rmesa->radeon.dri.fd;
+-	int index = 0;
+-	int size = 0;
+-	drmDMAReq dma;
+-	int ret;
+-
+-	if (RADEON_DEBUG & (DEBUG_IOCTL | DEBUG_DMA))
+-		fprintf(stderr, "%s\n", __FUNCTION__);
+-
+-	if (rmesa->dma.flush) {
+-		rmesa->dma.flush(rmesa);
+-	}
+-
+-	if (rmesa->dma.current.buf)
+-		r300ReleaseDmaRegion(rmesa, &rmesa->dma.current, __FUNCTION__);
+-
+-	if (rmesa->dma.nr_released_bufs > 4)
+-		r300FlushCmdBuf(rmesa, __FUNCTION__);
+-
+-	dma.context = rmesa->radeon.dri.hwContext;
+-	dma.send_count = 0;
+-	dma.send_list = NULL;
+-	dma.send_sizes = NULL;
+-	dma.flags = 0;
+-	dma.request_count = 1;
+-	dma.request_size = RADEON_BUFFER_SIZE;
+-	dma.request_list = &index;
+-	dma.request_sizes = &size;
+-	dma.granted_count = 0;
+-
+-	LOCK_HARDWARE(&rmesa->radeon);	/* no need to validate */
+-
+-	ret = drmDMA(fd, &dma);
+-
+-	if (ret != 0) {
+-		/* Try to release some buffers and wait until we can't get any more */
+-		if (rmesa->dma.nr_released_bufs) {
+-			r300FlushCmdBufLocked(rmesa, __FUNCTION__);
+-		}
+-
+-		if (RADEON_DEBUG & DEBUG_DMA)
+-			fprintf(stderr, "Waiting for buffers\n");
+-
+-		radeonWaitForIdleLocked(&rmesa->radeon);
+-		ret = drmDMA(fd, &dma);
+-
+-		if (ret != 0) {
+-			UNLOCK_HARDWARE(&rmesa->radeon);
+-			fprintf(stderr,
+-				"Error: Could not get dma buffer... exiting\n");
+-			_mesa_exit(-1);
+-		}
+-	}
+-
+-	UNLOCK_HARDWARE(&rmesa->radeon);
+-
+-	if (RADEON_DEBUG & DEBUG_DMA)
+-		fprintf(stderr, "Allocated buffer %d\n", index);
+-
+-	dmabuf = CALLOC_STRUCT(r300_dma_buffer);
+-	dmabuf->buf = &rmesa->radeon.radeonScreen->buffers->list[index];
+-	dmabuf->refcount = 1;
+-
+-	rmesa->dma.current.buf = dmabuf;
+-	rmesa->dma.current.address = dmabuf->buf->address;
+-	rmesa->dma.current.end = dmabuf->buf->total;
+-	rmesa->dma.current.start = 0;
+-	rmesa->dma.current.ptr = 0;
+-}
+-
+-void r300ReleaseDmaRegion(r300ContextPtr rmesa,
+-			  struct r300_dma_region *region, const char *caller)
+-{
+-	if (RADEON_DEBUG & DEBUG_IOCTL)
+-		fprintf(stderr, "%s from %s\n", __FUNCTION__, caller);
+-
+-	if (!region->buf)
+-		return;
+-
+-	if (rmesa->dma.flush)
+-		rmesa->dma.flush(rmesa);
+-
+-	if (--region->buf->refcount == 0) {
+-		drm_radeon_cmd_header_t *cmd;
+-
+-		if (RADEON_DEBUG & (DEBUG_IOCTL | DEBUG_DMA))
+-			fprintf(stderr, "%s -- DISCARD BUF %d\n",
+-				__FUNCTION__, region->buf->buf->idx);
+-		cmd =
+-		    (drm_radeon_cmd_header_t *) r300AllocCmdBuf(rmesa,
+-								sizeof
+-								(*cmd) / 4,
+-								__FUNCTION__);
+-		cmd->dma.cmd_type = R300_CMD_DMA_DISCARD;
+-		cmd->dma.buf_idx = region->buf->buf->idx;
+-
+-		FREE(region->buf);
+-		rmesa->dma.nr_released_bufs++;
+-	}
+-
+-	region->buf = 0;
+-	region->start = 0;
+-}
+-
+-/* Allocates a region from rmesa->dma.current.  If there isn't enough
+- * space in current, grab a new buffer (and discard what was left of current)
+- */
+-void r300AllocDmaRegion(r300ContextPtr rmesa,
+-			struct r300_dma_region *region,
+-			int bytes, int alignment)
+-{
+-	if (RADEON_DEBUG & DEBUG_IOCTL)
+-		fprintf(stderr, "%s %d\n", __FUNCTION__, bytes);
+-
+-	if (rmesa->dma.flush)
+-		rmesa->dma.flush(rmesa);
+-
+-	if (region->buf)
+-		r300ReleaseDmaRegion(rmesa, region, __FUNCTION__);
+-
+-	alignment--;
+-	rmesa->dma.current.start = rmesa->dma.current.ptr =
+-	    (rmesa->dma.current.ptr + alignment) & ~alignment;
+-
+-	if (rmesa->dma.current.ptr + bytes > rmesa->dma.current.end)
+-		r300RefillCurrentDmaRegion(rmesa);
+-
+-	region->start = rmesa->dma.current.start;
+-	region->ptr = rmesa->dma.current.start;
+-	region->end = rmesa->dma.current.start + bytes;
+-	region->address = rmesa->dma.current.address;
+-	region->buf = rmesa->dma.current.buf;
+-	region->buf->refcount++;
+-
+-	rmesa->dma.current.ptr += bytes;	/* bug - if alignment > 7 */
+-	rmesa->dma.current.start =
+-	    rmesa->dma.current.ptr = (rmesa->dma.current.ptr + 0x7) & ~0x7;
+-
+-	assert(rmesa->dma.current.ptr <= rmesa->dma.current.end);
+-}
+-
+-#endif
+-
+-GLboolean r300IsGartMemory(r300ContextPtr rmesa, const GLvoid * pointer,
+-			   GLint size)
+-{
+-	int offset =
+-	    (char *)pointer -
+-	    (char *)rmesa->radeon.radeonScreen->gartTextures.map;
+-	int valid = (size >= 0 && offset >= 0
+-		     && offset + size <
+-		     rmesa->radeon.radeonScreen->gartTextures.size);
+-
+-	if (RADEON_DEBUG & DEBUG_IOCTL)
+-		fprintf(stderr, "r300IsGartMemory( %p ) : %d\n", pointer,
+-			valid);
+-
+-	return valid;
+-}
+-
+-GLuint r300GartOffsetFromVirtual(r300ContextPtr rmesa, const GLvoid * pointer)
+-{
+-	int offset =
+-	    (char *)pointer -
+-	    (char *)rmesa->radeon.radeonScreen->gartTextures.map;
+-
+-	//fprintf(stderr, "offset=%08x\n", offset);
+-
+-	if (offset < 0
+-	    || offset > rmesa->radeon.radeonScreen->gartTextures.size)
+-		return ~0;
+-	else
+-		return rmesa->radeon.radeonScreen->gart_texture_offset + offset;
++	COMMIT_BATCH();
+ }
+ 
+ void r300InitIoctlFuncs(struct dd_function_table *functions)
+ {
+ 	functions->Clear = r300Clear;
+ 	functions->Finish = radeonFinish;
+-	functions->Flush = r300Flush;
++	functions->Flush = radeonFlush;
+ }
+diff --git a/src/mesa/drivers/dri/r300/r300_ioctl.h b/src/mesa/drivers/dri/r300/r300_ioctl.h
+index e1143fb..3abfa71 100644
+--- a/src/mesa/drivers/dri/r300/r300_ioctl.h
++++ b/src/mesa/drivers/dri/r300/r300_ioctl.h
+@@ -39,22 +39,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "r300_context.h"
+ #include "radeon_drm.h"
+ 
+-extern GLboolean r300IsGartMemory(r300ContextPtr rmesa,
+-				  const GLvoid * pointer, GLint size);
+-
+-extern GLuint r300GartOffsetFromVirtual(r300ContextPtr rmesa,
+-					const GLvoid * pointer);
+-
+-extern void r300Flush(GLcontext * ctx);
+-
+-extern void r300ReleaseDmaRegion(r300ContextPtr rmesa,
+-				 struct r300_dma_region *region,
+-				 const char *caller);
+-extern void r300AllocDmaRegion(r300ContextPtr rmesa,
+-			       struct r300_dma_region *region, int bytes,
+-			       int alignment);
+-
+ extern void r300InitIoctlFuncs(struct dd_function_table *functions);
+ 
+-extern void r300RefillCurrentDmaRegion(r300ContextPtr rmesa, int size);
+ #endif				/* __R300_IOCTL_H__ */
+diff --git a/src/mesa/drivers/dri/r300/r300_mem.c b/src/mesa/drivers/dri/r300/r300_mem.c
+deleted file mode 100644
+index f8f9d4f..0000000
+--- a/src/mesa/drivers/dri/r300/r300_mem.c
++++ /dev/null
+@@ -1,385 +0,0 @@
+-/*
+- * Copyright (C) 2005 Aapo Tahkola.
+- *
+- * All Rights Reserved.
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining
+- * a copy of this software and associated documentation files (the
+- * "Software"), to deal in the Software without restriction, including
+- * without limitation the rights to use, copy, modify, merge, publish,
+- * distribute, sublicense, and/or sell copies of the Software, and to
+- * permit persons to whom the Software is furnished to do so, subject to
+- * the following conditions:
+- *
+- * The above copyright notice and this permission notice (including the
+- * next paragraph) shall be included in all copies or substantial
+- * portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+- * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+- * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+- * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+- * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+- *
+- */
+-
+-/**
+- * \file
+- *
+- * \author Aapo Tahkola <aet@rasterburn.org>
+- */
+-
+-#include <unistd.h>
+-
+-#include "r300_context.h"
+-#include "r300_cmdbuf.h"
+-#include "r300_ioctl.h"
+-#include "r300_mem.h"
+-#include "radeon_ioctl.h"
+-
+-#ifdef USER_BUFFERS
+-
+-static void resize_u_list(r300ContextPtr rmesa)
+-{
+-	void *temp;
+-	int nsize;
+-
+-	temp = rmesa->rmm->u_list;
+-	nsize = rmesa->rmm->u_size * 2;
+-
+-	rmesa->rmm->u_list = _mesa_malloc(nsize * sizeof(*rmesa->rmm->u_list));
+-	_mesa_memset(rmesa->rmm->u_list, 0,
+-		     nsize * sizeof(*rmesa->rmm->u_list));
+-
+-	if (temp) {
+-		r300FlushCmdBuf(rmesa, __FUNCTION__);
+-
+-		_mesa_memcpy(rmesa->rmm->u_list, temp,
+-			     rmesa->rmm->u_size * sizeof(*rmesa->rmm->u_list));
+-		_mesa_free(temp);
+-	}
+-
+-	rmesa->rmm->u_size = nsize;
+-}
+-
+-void r300_mem_init(r300ContextPtr rmesa)
+-{
+-	rmesa->rmm = malloc(sizeof(struct r300_memory_manager));
+-	memset(rmesa->rmm, 0, sizeof(struct r300_memory_manager));
+-
+-	rmesa->rmm->u_size = 128;
+-	resize_u_list(rmesa);
+-}
+-
+-void r300_mem_destroy(r300ContextPtr rmesa)
+-{
+-	_mesa_free(rmesa->rmm->u_list);
+-	rmesa->rmm->u_list = NULL;
+-
+-	_mesa_free(rmesa->rmm);
+-	rmesa->rmm = NULL;
+-}
+-
+-void *r300_mem_ptr(r300ContextPtr rmesa, int id)
+-{
+-	assert(id <= rmesa->rmm->u_last);
+-	return rmesa->rmm->u_list[id].ptr;
+-}
+-
+-int r300_mem_find(r300ContextPtr rmesa, void *ptr)
+-{
+-	int i;
+-
+-	for (i = 1; i < rmesa->rmm->u_size + 1; i++)
+-		if (rmesa->rmm->u_list[i].ptr &&
+-		    ptr >= rmesa->rmm->u_list[i].ptr &&
+-		    ptr <
+-		    rmesa->rmm->u_list[i].ptr + rmesa->rmm->u_list[i].size)
+-			break;
+-
+-	if (i < rmesa->rmm->u_size + 1)
+-		return i;
+-
+-	fprintf(stderr, "%p failed\n", ptr);
+-	return 0;
+-}
+-
+-//#define MM_DEBUG
+-int r300_mem_alloc(r300ContextPtr rmesa, int alignment, int size)
+-{
+-	drm_radeon_mem_alloc_t alloc;
+-	int offset = 0, ret;
+-	int i, free = -1;
+-	int done_age;
+-	drm_radeon_mem_free_t memfree;
+-	int tries = 0;
+-	static int bytes_wasted = 0, allocated = 0;
+-
+-	if (size < 4096)
+-		bytes_wasted += 4096 - size;
+-
+-	allocated += size;
+-
+-#if 0
+-	static int t = 0;
+-	if (t != time(NULL)) {
+-		t = time(NULL);
+-		fprintf(stderr, "slots used %d, wasted %d kb, allocated %d\n",
+-			rmesa->rmm->u_last, bytes_wasted / 1024,
+-			allocated / 1024);
+-	}
+-#endif
+-
+-	memfree.region = RADEON_MEM_REGION_GART;
+-
+-      again:
+-
+-	done_age = radeonGetAge((radeonContextPtr) rmesa);
+-
+-	if (rmesa->rmm->u_last + 1 >= rmesa->rmm->u_size)
+-		resize_u_list(rmesa);
+-
+-	for (i = rmesa->rmm->u_last + 1; i > 0; i--) {
+-		if (rmesa->rmm->u_list[i].ptr == NULL) {
+-			free = i;
+-			continue;
+-		}
+-
+-		if (rmesa->rmm->u_list[i].h_pending == 0 &&
+-		    rmesa->rmm->u_list[i].pending
+-		    && rmesa->rmm->u_list[i].age <= done_age) {
+-			memfree.region_offset =
+-			    (char *)rmesa->rmm->u_list[i].ptr -
+-			    (char *)rmesa->radeon.radeonScreen->gartTextures.
+-			    map;
+-
+-			ret =
+-			    drmCommandWrite(rmesa->radeon.radeonScreen->
+-					    driScreen->fd, DRM_RADEON_FREE,
+-					    &memfree, sizeof(memfree));
+-
+-			if (ret) {
+-				fprintf(stderr, "Failed to free at %p\n",
+-					rmesa->rmm->u_list[i].ptr);
+-				fprintf(stderr, "ret = %s\n", strerror(-ret));
+-				exit(1);
+-			} else {
+-#ifdef MM_DEBUG
+-				fprintf(stderr, "really freed %d at age %x\n",
+-					i,
+-					radeonGetAge((radeonContextPtr) rmesa));
+-#endif
+-				if (i == rmesa->rmm->u_last)
+-					rmesa->rmm->u_last--;
+-
+-				if (rmesa->rmm->u_list[i].size < 4096)
+-					bytes_wasted -=
+-					    4096 - rmesa->rmm->u_list[i].size;
+-
+-				allocated -= rmesa->rmm->u_list[i].size;
+-				rmesa->rmm->u_list[i].pending = 0;
+-				rmesa->rmm->u_list[i].ptr = NULL;
+-				free = i;
+-			}
+-		}
+-	}
+-	rmesa->rmm->u_head = i;
+-
+-	if (free == -1) {
+-		WARN_ONCE("Ran out of slots!\n");
+-		//usleep(100);
+-		r300FlushCmdBuf(rmesa, __FUNCTION__);
+-		tries++;
+-		if (tries > 100) {
+-			WARN_ONCE("Ran out of slots!\n");
+-			exit(1);
+-		}
+-		goto again;
+-	}
+-
+-	alloc.region = RADEON_MEM_REGION_GART;
+-	alloc.alignment = alignment;
+-	alloc.size = size;
+-	alloc.region_offset = &offset;
+-
+-	ret =
+-	    drmCommandWriteRead(rmesa->radeon.dri.fd, DRM_RADEON_ALLOC, &alloc,
+-				sizeof(alloc));
+-	if (ret) {
+-#if 0
+-		WARN_ONCE("Ran out of mem!\n");
+-		r300FlushCmdBuf(rmesa, __FUNCTION__);
+-		//usleep(100);
+-		tries2++;
+-		tries = 0;
+-		if (tries2 > 100) {
+-			WARN_ONCE("Ran out of GART memory!\n");
+-			exit(1);
+-		}
+-		goto again;
+-#else
+-		WARN_ONCE
+-		    ("Ran out of GART memory (for %d)!\nPlease consider adjusting GARTSize option.\n",
+-		     size);
+-		return 0;
+-#endif
+-	}
+-
+-	i = free;
+-
+-	if (i > rmesa->rmm->u_last)
+-		rmesa->rmm->u_last = i;
+-
+-	rmesa->rmm->u_list[i].ptr =
+-	    ((GLubyte *) rmesa->radeon.radeonScreen->gartTextures.map) + offset;
+-	rmesa->rmm->u_list[i].size = size;
+-	rmesa->rmm->u_list[i].age = 0;
+-	//fprintf(stderr, "alloc %p at id %d\n", rmesa->rmm->u_list[i].ptr, i);
+-
+-#ifdef MM_DEBUG
+-	fprintf(stderr, "allocated %d at age %x\n", i,
+-		radeonGetAge((radeonContextPtr) rmesa));
+-#endif
+-
+-	return i;
+-}
+-
+-void r300_mem_use(r300ContextPtr rmesa, int id)
+-{
+-	uint64_t ull;
+-#ifdef MM_DEBUG
+-	fprintf(stderr, "%s: %d at age %x\n", __FUNCTION__, id,
+-		radeonGetAge((radeonContextPtr) rmesa));
+-#endif
+-	drm_r300_cmd_header_t *cmd;
+-
+-	assert(id <= rmesa->rmm->u_last);
+-
+-	if (id == 0)
+-		return;
+-
+-	cmd =
+-	    (drm_r300_cmd_header_t *) r300AllocCmdBuf(rmesa,
+-						      2 + sizeof(ull) / 4,
+-						      __FUNCTION__);
+-	cmd[0].scratch.cmd_type = R300_CMD_SCRATCH;
+-	cmd[0].scratch.reg = R300_MEM_SCRATCH;
+-	cmd[0].scratch.n_bufs = 1;
+-	cmd[0].scratch.flags = 0;
+-	cmd++;
+-
+-	ull = (uint64_t) (intptr_t) & rmesa->rmm->u_list[id].age;
+-	_mesa_memcpy(cmd, &ull, sizeof(ull));
+-	cmd += sizeof(ull) / 4;
+-
+-	cmd[0].u = /*id */ 0;
+-
+-	LOCK_HARDWARE(&rmesa->radeon);	/* Protect from DRM. */
+-	rmesa->rmm->u_list[id].h_pending++;
+-	UNLOCK_HARDWARE(&rmesa->radeon);
+-}
+-
+-unsigned long r300_mem_offset(r300ContextPtr rmesa, int id)
+-{
+-	unsigned long offset;
+-
+-	assert(id <= rmesa->rmm->u_last);
+-
+-	offset = (char *)rmesa->rmm->u_list[id].ptr -
+-	    (char *)rmesa->radeon.radeonScreen->gartTextures.map;
+-	offset += rmesa->radeon.radeonScreen->gart_texture_offset;
+-
+-	return offset;
+-}
+-
+-void *r300_mem_map(r300ContextPtr rmesa, int id, int access)
+-{
+-#ifdef MM_DEBUG
+-	fprintf(stderr, "%s: %d at age %x\n", __FUNCTION__, id,
+-		radeonGetAge((radeonContextPtr) rmesa));
+-#endif
+-	void *ptr;
+-	int tries = 0;
+-
+-	assert(id <= rmesa->rmm->u_last);
+-
+-	if (access == R300_MEM_R) {
+-
+-		if (rmesa->rmm->u_list[id].mapped == 1)
+-			WARN_ONCE("buffer %d already mapped\n", id);
+-
+-		rmesa->rmm->u_list[id].mapped = 1;
+-		ptr = r300_mem_ptr(rmesa, id);
+-
+-		return ptr;
+-	}
+-
+-	if (rmesa->rmm->u_list[id].h_pending)
+-		r300FlushCmdBuf(rmesa, __FUNCTION__);
+-
+-	if (rmesa->rmm->u_list[id].h_pending) {
+-		return NULL;
+-	}
+-
+-	while (rmesa->rmm->u_list[id].age >
+-	       radeonGetAge((radeonContextPtr) rmesa) && tries++ < 1000)
+-		usleep(10);
+-
+-	if (tries >= 1000) {
+-		fprintf(stderr, "Idling failed (%x vs %x)\n",
+-			rmesa->rmm->u_list[id].age,
+-			radeonGetAge((radeonContextPtr) rmesa));
+-		return NULL;
+-	}
+-
+-	if (rmesa->rmm->u_list[id].mapped == 1)
+-		WARN_ONCE("buffer %d already mapped\n", id);
+-
+-	rmesa->rmm->u_list[id].mapped = 1;
+-	ptr = r300_mem_ptr(rmesa, id);
+-
+-	return ptr;
+-}
+-
+-void r300_mem_unmap(r300ContextPtr rmesa, int id)
+-{
+-#ifdef MM_DEBUG
+-	fprintf(stderr, "%s: %d at age %x\n", __FUNCTION__, id,
+-		radeonGetAge((radeonContextPtr) rmesa));
+-#endif
+-
+-	assert(id <= rmesa->rmm->u_last);
+-
+-	if (rmesa->rmm->u_list[id].mapped == 0)
+-		WARN_ONCE("buffer %d not mapped\n", id);
+-
+-	rmesa->rmm->u_list[id].mapped = 0;
+-}
+-
+-void r300_mem_free(r300ContextPtr rmesa, int id)
+-{
+-#ifdef MM_DEBUG
+-	fprintf(stderr, "%s: %d at age %x\n", __FUNCTION__, id,
+-		radeonGetAge((radeonContextPtr) rmesa));
+-#endif
+-
+-	assert(id <= rmesa->rmm->u_last);
+-
+-	if (id == 0)
+-		return;
+-
+-	if (rmesa->rmm->u_list[id].ptr == NULL) {
+-		WARN_ONCE("Not allocated!\n");
+-		return;
+-	}
+-
+-	if (rmesa->rmm->u_list[id].pending) {
+-		WARN_ONCE("%p already pended!\n", rmesa->rmm->u_list[id].ptr);
+-		return;
+-	}
+-
+-	rmesa->rmm->u_list[id].pending = 1;
+-}
+-#endif
+diff --git a/src/mesa/drivers/dri/r300/r300_mem.h b/src/mesa/drivers/dri/r300/r300_mem.h
+deleted file mode 100644
+index 625a7f6..0000000
+--- a/src/mesa/drivers/dri/r300/r300_mem.h
++++ /dev/null
+@@ -1,37 +0,0 @@
+-#ifndef __R300_MEM_H__
+-#define __R300_MEM_H__
+-
+-//#define R300_MEM_PDL 0
+-#define R300_MEM_UL 1
+-
+-#define R300_MEM_R 1
+-#define R300_MEM_W 2
+-#define R300_MEM_RW (R300_MEM_R | R300_MEM_W)
+-
+-#define R300_MEM_SCRATCH 2
+-
+-struct r300_memory_manager {
+-	struct {
+-		void *ptr;
+-		uint32_t size;
+-		uint32_t age;
+-		uint32_t h_pending;
+-		int pending;
+-		int mapped;
+-	} *u_list;
+-	int u_head, u_size, u_last;
+-
+-};
+-
+-extern void r300_mem_init(r300ContextPtr rmesa);
+-extern void r300_mem_destroy(r300ContextPtr rmesa);
+-extern void *r300_mem_ptr(r300ContextPtr rmesa, int id);
+-extern int r300_mem_find(r300ContextPtr rmesa, void *ptr);
+-extern int r300_mem_alloc(r300ContextPtr rmesa, int alignment, int size);
+-extern void r300_mem_use(r300ContextPtr rmesa, int id);
+-extern unsigned long r300_mem_offset(r300ContextPtr rmesa, int id);
+-extern void *r300_mem_map(r300ContextPtr rmesa, int id, int access);
+-extern void r300_mem_unmap(r300ContextPtr rmesa, int id);
+-extern void r300_mem_free(r300ContextPtr rmesa, int id);
+-
+-#endif
+diff --git a/src/mesa/drivers/dri/r300/r300_reg.h b/src/mesa/drivers/dri/r300/r300_reg.h
+index 7c6485e..5f344be 100644
+--- a/src/mesa/drivers/dri/r300/r300_reg.h
++++ b/src/mesa/drivers/dri/r300/r300_reg.h
+@@ -656,7 +656,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #	define R300_GB_FOG_SELECT_C3A           (3 << 0)
+ #	define R300_GB_FOG_SELECT_1_1_W         (4 << 0)
+ #	define R300_GB_FOG_SELECT_Z		(5 << 0)
+-#	define R300_GB_DEPTH_SELECT_Z		(0 << 3
++#	define R300_GB_DEPTH_SELECT_Z		(0 << 3)
+ #	define R300_GB_DEPTH_SELECT_1_1_W	(1 << 3)
+ #	define R300_GB_W_SELECT_1_W		(0 << 4)
+ #	define R300_GB_W_SELECT_1		(1 << 4)
+@@ -730,8 +730,8 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #define R500_RS_IP_TEX_PTR_Q_SHIFT 			18
+ #define R500_RS_IP_COL_PTR_SHIFT 			24
+ #define R500_RS_IP_COL_FMT_SHIFT 			27
+-#	define R500_RS_COL_PTR(x)		        (x << 24)
+-#       define R500_RS_COL_FMT(x)                       (x << 27)
++#	define R500_RS_COL_PTR(x)		        ((x) << 24)
++#       define R500_RS_COL_FMT(x)                       ((x) << 27)
+ /* gap */
+ #define R500_RS_IP_OFFSET_DIS 				(0 << 31)
+ #define R500_RS_IP_OFFSET_EN 				(1 << 31)
+@@ -1172,9 +1172,9 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #define R300_RS_IP_3				        0x431C
+ #       define R300_RS_INTERP_SRC_SHIFT          2 /* TODO: check for removal */
+ #       define R300_RS_INTERP_SRC_MASK           (7 << 2) /* TODO: check for removal */
+-#	define R300_RS_TEX_PTR(x)		        (x << 0)
+-#	define R300_RS_COL_PTR(x)		        (x << 6)
+-#	define R300_RS_COL_FMT(x)		        (x << 9)
++#	define R300_RS_TEX_PTR(x)		        ((x) << 0)
++#	define R300_RS_COL_PTR(x)		        ((x) << 6)
++#	define R300_RS_COL_FMT(x)		        ((x) << 9)
+ #	define R300_RS_COL_FMT_RGBA		        0
+ #	define R300_RS_COL_FMT_RGB0		        1
+ #	define R300_RS_COL_FMT_RGB1		        2
+@@ -1184,10 +1184,10 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #	define R300_RS_COL_FMT_111A		        8
+ #	define R300_RS_COL_FMT_1110		        9
+ #	define R300_RS_COL_FMT_1111		        10
+-#	define R300_RS_SEL_S(x)		                (x << 13)
+-#	define R300_RS_SEL_T(x)		                (x << 16)
+-#	define R300_RS_SEL_R(x)		                (x << 19)
+-#	define R300_RS_SEL_Q(x)		                (x << 22)
++#	define R300_RS_SEL_S(x)		                ((x) << 13)
++#	define R300_RS_SEL_T(x)		                ((x) << 16)
++#	define R300_RS_SEL_R(x)		                ((x) << 19)
++#	define R300_RS_SEL_Q(x)		                ((x) << 22)
+ #	define R300_RS_SEL_C0		                0
+ #	define R300_RS_SEL_C1		                1
+ #	define R300_RS_SEL_C2		                2
+@@ -1525,6 +1525,13 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #	define R500_SEL_FILTER4_TC3		 (3 << 18)
+ 
+ #define R300_TX_OFFSET_0                    0x4540
++#define R300_TX_OFFSET_1                    0x4544
++#define R300_TX_OFFSET_2                    0x4548
++#define R300_TX_OFFSET_3                    0x454C
++#define R300_TX_OFFSET_4                    0x4550
++#define R300_TX_OFFSET_5                    0x4554
++#define R300_TX_OFFSET_6                    0x4558
++#define R300_TX_OFFSET_7                    0x455C
+ 	/* BEGIN: Guess from R200 */
+ #       define R300_TXO_ENDIAN_NO_SWAP           (0 << 0)
+ #       define R300_TXO_ENDIAN_BYTE_SWAP         (1 << 0)
+@@ -2705,7 +2712,7 @@ enum {
+ #   define R500_ALPHA_OP_COS				13
+ #   define R500_ALPHA_OP_MDH				14
+ #   define R500_ALPHA_OP_MDV				15
+-#   define R500_ALPHA_ADDRD(x)				(x << 4)
++#   define R500_ALPHA_ADDRD(x)				((x) << 4)
+ #   define R500_ALPHA_ADDRD_REL				(1 << 11)
+ #  define R500_ALPHA_SEL_A_SHIFT			12
+ #   define R500_ALPHA_SEL_A_SRC0			(0 << 12)
+@@ -2749,16 +2756,16 @@ enum {
+ #   define R500_ALPHA_OMOD_DIV_4			(5 << 26)
+ #   define R500_ALPHA_OMOD_DIV_8			(6 << 26)
+ #   define R500_ALPHA_OMOD_DISABLE			(7 << 26)
+-#   define R500_ALPHA_TARGET(x)				(x << 29)
++#   define R500_ALPHA_TARGET(x)				((x) << 29)
+ #   define R500_ALPHA_W_OMASK				(1 << 31)
+ #define R500_US_ALU_ALPHA_ADDR_0			0x9800
+-#   define R500_ALPHA_ADDR0(x)				(x << 0)
++#   define R500_ALPHA_ADDR0(x)				((x) << 0)
+ #   define R500_ALPHA_ADDR0_CONST			(1 << 8)
+ #   define R500_ALPHA_ADDR0_REL				(1 << 9)
+-#   define R500_ALPHA_ADDR1(x)				(x << 10)
++#   define R500_ALPHA_ADDR1(x)				((x) << 10)
+ #   define R500_ALPHA_ADDR1_CONST			(1 << 18)
+ #   define R500_ALPHA_ADDR1_REL				(1 << 19)
+-#   define R500_ALPHA_ADDR2(x)				(x << 20)
++#   define R500_ALPHA_ADDR2(x)				((x) << 20)
+ #   define R500_ALPHA_ADDR2_CONST			(1 << 28)
+ #   define R500_ALPHA_ADDR2_REL				(1 << 29)
+ #   define R500_ALPHA_SRCP_OP_1_MINUS_2A0		(0 << 30)
+@@ -2779,7 +2786,7 @@ enum {
+ #   define R500_ALU_RGBA_OP_SOP				(10 << 0)
+ #   define R500_ALU_RGBA_OP_MDH				(11 << 0)
+ #   define R500_ALU_RGBA_OP_MDV				(12 << 0)
+-#   define R500_ALU_RGBA_ADDRD(x)			(x << 4)
++#   define R500_ALU_RGBA_ADDRD(x)			((x) << 4)
+ #   define R500_ALU_RGBA_ADDRD_REL			(1 << 11)
+ #  define R500_ALU_RGBA_SEL_C_SHIFT			12
+ #   define R500_ALU_RGBA_SEL_C_SRC0			(0 << 12)
+@@ -2906,16 +2913,16 @@ enum {
+ #   define R500_ALU_RGB_OMOD_DIV_4			(5 << 26)
+ #   define R500_ALU_RGB_OMOD_DIV_8			(6 << 26)
+ #   define R500_ALU_RGB_OMOD_DISABLE			(7 << 26)
+-#   define R500_ALU_RGB_TARGET(x)			(x << 29)
++#   define R500_ALU_RGB_TARGET(x)			((x) << 29)
+ #   define R500_ALU_RGB_WMASK				(1 << 31)
+ #define R500_US_ALU_RGB_ADDR_0				0x9000
+-#   define R500_RGB_ADDR0(x)				(x << 0)
++#   define R500_RGB_ADDR0(x)				((x) << 0)
+ #   define R500_RGB_ADDR0_CONST				(1 << 8)
+ #   define R500_RGB_ADDR0_REL				(1 << 9)
+-#   define R500_RGB_ADDR1(x)				(x << 10)
++#   define R500_RGB_ADDR1(x)				((x) << 10)
+ #   define R500_RGB_ADDR1_CONST				(1 << 18)
+ #   define R500_RGB_ADDR1_REL				(1 << 19)
+-#   define R500_RGB_ADDR2(x)				(x << 20)
++#   define R500_RGB_ADDR2(x)				((x) << 20)
+ #   define R500_RGB_ADDR2_CONST				(1 << 28)
+ #   define R500_RGB_ADDR2_REL				(1 << 29)
+ #   define R500_RGB_SRCP_OP_1_MINUS_2RGB0		(0 << 30)
+@@ -2970,19 +2977,19 @@ enum {
+ 
+ /* note that these are 8 bit lengths, despite the offsets, at least for R500 */
+ #define R500_US_CODE_ADDR				0x4630
+-#   define R500_US_CODE_START_ADDR(x)			(x << 0)
+-#   define R500_US_CODE_END_ADDR(x)			(x << 16)
++#   define R500_US_CODE_START_ADDR(x)			((x) << 0)
++#   define R500_US_CODE_END_ADDR(x)			((x) << 16)
+ #define R500_US_CODE_OFFSET				0x4638
+-#   define R500_US_CODE_OFFSET_ADDR(x)			(x << 0)
++#   define R500_US_CODE_OFFSET_ADDR(x)			((x) << 0)
+ #define R500_US_CODE_RANGE				0x4634
+-#   define R500_US_CODE_RANGE_ADDR(x)			(x << 0)
+-#   define R500_US_CODE_RANGE_SIZE(x)			(x << 16)
++#   define R500_US_CODE_RANGE_ADDR(x)			((x) << 0)
++#   define R500_US_CODE_RANGE_SIZE(x)			((x) << 16)
+ #define R500_US_CONFIG					0x4600
+ #   define R500_ZERO_TIMES_ANYTHING_EQUALS_ZERO		(1 << 1)
+ #define R500_US_FC_ADDR_0				0xa000
+-#   define R500_FC_BOOL_ADDR(x)				(x << 0)
+-#   define R500_FC_INT_ADDR(x)				(x << 8)
+-#   define R500_FC_JUMP_ADDR(x)				(x << 16)
++#   define R500_FC_BOOL_ADDR(x)				((x) << 0)
++#   define R500_FC_INT_ADDR(x)				((x) << 8)
++#   define R500_FC_JUMP_ADDR(x)				((x) << 16)
+ #   define R500_FC_JUMP_GLOBAL				(1 << 31)
+ #define R500_US_FC_BOOL_CONST				0x4620
+ #   define R500_FC_KBOOL(x)				(x)
+@@ -3003,8 +3010,8 @@ enum {
+ #   define R500_FC_A_OP_NONE				(0 << 6)
+ #   define R500_FC_A_OP_POP				(1 << 6)
+ #   define R500_FC_A_OP_PUSH				(2 << 6)
+-#   define R500_FC_JUMP_FUNC(x)				(x << 8)
+-#   define R500_FC_B_POP_CNT(x)				(x << 16)
++#   define R500_FC_JUMP_FUNC(x)				((x) << 8)
++#   define R500_FC_B_POP_CNT(x)				((x) << 16)
+ #   define R500_FC_B_OP0_NONE				(0 << 24)
+ #   define R500_FC_B_OP0_DECR				(1 << 24)
+ #   define R500_FC_B_OP0_INCR				(2 << 24)
+@@ -3013,14 +3020,14 @@ enum {
+ #   define R500_FC_B_OP1_INCR				(2 << 26)
+ #   define R500_FC_IGNORE_UNCOVERED			(1 << 28)
+ #define R500_US_FC_INT_CONST_0				0x4c00
+-#   define R500_FC_INT_CONST_KR(x)			(x << 0)
+-#   define R500_FC_INT_CONST_KG(x)			(x << 8)
+-#   define R500_FC_INT_CONST_KB(x)			(x << 16)
++#   define R500_FC_INT_CONST_KR(x)			((x) << 0)
++#   define R500_FC_INT_CONST_KG(x)			((x) << 8)
++#   define R500_FC_INT_CONST_KB(x)			((x) << 16)
+ /* _0 through _15 */
+ #define R500_US_FORMAT0_0				0x4640
+-#   define R500_FORMAT_TXWIDTH(x)			(x << 0)
+-#   define R500_FORMAT_TXHEIGHT(x)			(x << 11)
+-#   define R500_FORMAT_TXDEPTH(x)			(x << 22)
++#   define R500_FORMAT_TXWIDTH(x)			((x) << 0)
++#   define R500_FORMAT_TXHEIGHT(x)			((x) << 11)
++#   define R500_FORMAT_TXDEPTH(x)			((x) << 22)
+ /* _0 through _3 */
+ #define R500_US_OUT_FMT_0				0x46a4
+ #   define R500_OUT_FMT_C4_8				(0 << 0)
+@@ -3061,12 +3068,12 @@ enum {
+ #   define R500_C3_SEL_R				(1 << 14)
+ #   define R500_C3_SEL_G				(2 << 14)
+ #   define R500_C3_SEL_B				(3 << 14)
+-#   define R500_OUT_SIGN(x)				(x << 16)
++#   define R500_OUT_SIGN(x)				((x) << 16)
+ #   define R500_ROUND_ADJ				(1 << 20)
+ #define R500_US_PIXSIZE					0x4604
+ #   define R500_PIX_SIZE(x)				(x)
+ #define R500_US_TEX_ADDR_0				0x9800
+-#   define R500_TEX_SRC_ADDR(x)				(x << 0)
++#   define R500_TEX_SRC_ADDR(x)				((x) << 0)
+ #   define R500_TEX_SRC_ADDR_REL			(1 << 7)
+ #   define R500_TEX_SRC_S_SWIZ_R			(0 << 8)
+ #   define R500_TEX_SRC_S_SWIZ_G			(1 << 8)
+@@ -3084,7 +3091,7 @@ enum {
+ #   define R500_TEX_SRC_Q_SWIZ_G			(1 << 14)
+ #   define R500_TEX_SRC_Q_SWIZ_B			(2 << 14)
+ #   define R500_TEX_SRC_Q_SWIZ_A			(3 << 14)
+-#   define R500_TEX_DST_ADDR(x)				(x << 16)
++#   define R500_TEX_DST_ADDR(x)				((x) << 16)
+ #   define R500_TEX_DST_ADDR_REL			(1 << 23)
+ #   define R500_TEX_DST_R_SWIZ_R			(0 << 24)
+ #   define R500_TEX_DST_R_SWIZ_G			(1 << 24)
+@@ -3103,7 +3110,7 @@ enum {
+ #   define R500_TEX_DST_A_SWIZ_B			(2 << 30)
+ #   define R500_TEX_DST_A_SWIZ_A			(3 << 30)
+ #define R500_US_TEX_ADDR_DXDY_0				0xa000
+-#   define R500_DX_ADDR(x)				(x << 0)
++#   define R500_DX_ADDR(x)				((x) << 0)
+ #   define R500_DX_ADDR_REL				(1 << 7)
+ #   define R500_DX_S_SWIZ_R				(0 << 8)
+ #   define R500_DX_S_SWIZ_G				(1 << 8)
+@@ -3121,7 +3128,7 @@ enum {
+ #   define R500_DX_Q_SWIZ_G				(1 << 14)
+ #   define R500_DX_Q_SWIZ_B				(2 << 14)
+ #   define R500_DX_Q_SWIZ_A				(3 << 14)
+-#   define R500_DY_ADDR(x)				(x << 16)
++#   define R500_DY_ADDR(x)				((x) << 16)
+ #   define R500_DY_ADDR_REL				(1 << 17)
+ #   define R500_DY_S_SWIZ_R				(0 << 24)
+ #   define R500_DY_S_SWIZ_G				(1 << 24)
+@@ -3140,7 +3147,7 @@ enum {
+ #   define R500_DY_Q_SWIZ_B				(2 << 30)
+ #   define R500_DY_Q_SWIZ_A				(3 << 30)
+ #define R500_US_TEX_INST_0				0x9000
+-#   define R500_TEX_ID(x)				(x << 16)
++#   define R500_TEX_ID(x)				((x) << 16)
+ #   define R500_TEX_INST_NOP				(0 << 22)
+ #   define R500_TEX_INST_LD				(1 << 22)
+ #   define R500_TEX_INST_TEXKILL			(2 << 22)
+diff --git a/src/mesa/drivers/dri/r300/r300_render.c b/src/mesa/drivers/dri/r300/r300_render.c
+index f9266e4..f46477f 100644
+--- a/src/mesa/drivers/dri/r300/r300_render.c
++++ b/src/mesa/drivers/dri/r300/r300_render.c
+@@ -66,8 +66,6 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "tnl/t_vp_build.h"
+ #include "radeon_reg.h"
+ #include "radeon_macros.h"
+-#include "radeon_ioctl.h"
+-#include "radeon_state.h"
+ #include "r300_context.h"
+ #include "r300_ioctl.h"
+ #include "r300_state.h"
+@@ -175,85 +173,164 @@ int r300NumVerts(r300ContextPtr rmesa, int num_verts, int prim)
+ static void r300EmitElts(GLcontext * ctx, void *elts, unsigned long n_elts)
+ {
+ 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+-	struct r300_dma_region *rvb = &rmesa->state.elt_dma;
+ 	void *out;
+ 
+-	if (r300IsGartMemory(rmesa, elts, n_elts * 4)) {
+-		rvb->address = rmesa->radeon.radeonScreen->gartTextures.map;
+-		rvb->start = ((char *)elts) - rvb->address;
+-		rvb->aos_offset =
+-		    rmesa->radeon.radeonScreen->gart_texture_offset +
+-		    rvb->start;
+-		return;
+-	} else if (r300IsGartMemory(rmesa, elts, 1)) {
+-		WARN_ONCE("Pointer not within GART memory!\n");
+-		_mesa_exit(-1);
+-	}
+-
+-	r300AllocDmaRegion(rmesa, rvb, n_elts * 4, 4);
+-	rvb->aos_offset = GET_START(rvb);
+-
+-	out = rvb->address + rvb->start;
++	radeonAllocDmaRegion(&rmesa->radeon, &rmesa->state.elt_dma_bo,
++			     &rmesa->state.elt_dma_offset, n_elts * 4, 4);
++	radeon_bo_map(rmesa->state.elt_dma_bo, 1);
++	out = rmesa->state.elt_dma_bo->ptr + rmesa->state.elt_dma_offset;
+ 	memcpy(out, elts, n_elts * 4);
++	radeon_bo_unmap(rmesa->state.elt_dma_bo);
+ }
+ 
+-static void r300FireEB(r300ContextPtr rmesa, unsigned long addr,
+-		       int vertex_count, int type)
++static void r300FireEB(r300ContextPtr rmesa, int vertex_count, int type)
+ {
+-	int cmd_reserved = 0;
+-	int cmd_written = 0;
+-	drm_radeon_cmd_header_t *cmd = NULL;
+-
+-	start_packet3(CP_PACKET3(R300_PACKET3_3D_DRAW_INDX_2, 0), 0);
+-	e32(R300_VAP_VF_CNTL__PRIM_WALK_INDICES | (vertex_count << 16) | type | R300_VAP_VF_CNTL__INDEX_SIZE_32bit);
+-
+-	start_packet3(CP_PACKET3(R300_PACKET3_INDX_BUFFER, 2), 2);
+-	e32(R300_INDX_BUFFER_ONE_REG_WR | (0 << R300_INDX_BUFFER_SKIP_SHIFT) |
+-	    (R300_VAP_PORT_IDX0 >> 2));
+-	e32(addr);
+-	e32(vertex_count);
++	BATCH_LOCALS(&rmesa->radeon);
++
++	if (vertex_count > 0) {
++		BEGIN_BATCH(10);
++		OUT_BATCH_PACKET3(R300_PACKET3_3D_DRAW_INDX_2, 0);
++		OUT_BATCH(R300_VAP_VF_CNTL__PRIM_WALK_INDICES |
++			  ((vertex_count + 0) << 16) |
++			  type |
++			  R300_VAP_VF_CNTL__INDEX_SIZE_32bit);
++		
++		if (!rmesa->radeon.radeonScreen->kernel_mm) {
++			OUT_BATCH_PACKET3(R300_PACKET3_INDX_BUFFER, 2);
++			OUT_BATCH(R300_INDX_BUFFER_ONE_REG_WR | (0 << R300_INDX_BUFFER_SKIP_SHIFT) |
++	    			 (R300_VAP_PORT_IDX0 >> 2));
++			OUT_BATCH_RELOC(rmesa->state.elt_dma_offset,
++					rmesa->state.elt_dma_bo,
++					rmesa->state.elt_dma_offset,
++					RADEON_GEM_DOMAIN_GTT, 0, 0);
++			OUT_BATCH(vertex_count);
++		} else {
++			OUT_BATCH_PACKET3(R300_PACKET3_INDX_BUFFER, 2);
++			OUT_BATCH(R300_INDX_BUFFER_ONE_REG_WR | (0 << R300_INDX_BUFFER_SKIP_SHIFT) |
++	    			 (R300_VAP_PORT_IDX0 >> 2));
++			OUT_BATCH(rmesa->state.elt_dma_offset);
++			OUT_BATCH(vertex_count);
++			radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
++					      rmesa->state.elt_dma_bo,
++					      RADEON_GEM_DOMAIN_GTT, 0, 0);
++		}
++		END_BATCH();
++	}
+ }
+ 
+ static void r300EmitAOS(r300ContextPtr rmesa, GLuint nr, GLuint offset)
+ {
++	BATCH_LOCALS(&rmesa->radeon);
++	uint32_t voffset;
+ 	int sz = 1 + (nr >> 1) * 3 + (nr & 1) * 2;
+ 	int i;
+-	int cmd_reserved = 0;
+-	int cmd_written = 0;
+-	drm_radeon_cmd_header_t *cmd = NULL;
+-
++	
+ 	if (RADEON_DEBUG & DEBUG_VERTS)
+ 		fprintf(stderr, "%s: nr=%d, ofs=0x%08x\n", __FUNCTION__, nr,
+ 			offset);
+ 
+-	start_packet3(CP_PACKET3(R300_PACKET3_3D_LOAD_VBPNTR, sz - 1), sz - 1);
+-	e32(nr);
+-
+-	for (i = 0; i + 1 < nr; i += 2) {
+-		e32((rmesa->state.aos[i].aos_size << 0) |
+-		    (rmesa->state.aos[i].aos_stride << 8) |
+-		    (rmesa->state.aos[i + 1].aos_size << 16) |
+-		    (rmesa->state.aos[i + 1].aos_stride << 24));
++    
++	if (!rmesa->radeon.radeonScreen->kernel_mm) {
++		BEGIN_BATCH(sz+2+(nr * 2));
++		OUT_BATCH_PACKET3(R300_PACKET3_3D_LOAD_VBPNTR, sz - 1);
++		OUT_BATCH(nr);
++
++		for (i = 0; i + 1 < nr; i += 2) {
++			OUT_BATCH((rmesa->state.aos[i].components << 0) |
++				  (rmesa->state.aos[i].stride << 8) |
++				  (rmesa->state.aos[i + 1].components << 16) |
++				  (rmesa->state.aos[i + 1].stride << 24));
++			
++			voffset =  rmesa->state.aos[i + 0].offset +
++				offset * 4 * rmesa->state.aos[i + 0].stride;
++			OUT_BATCH_RELOC(voffset,
++					rmesa->state.aos[i].bo,
++					voffset,
++					RADEON_GEM_DOMAIN_GTT,
++					0, 0);
++			voffset =  rmesa->state.aos[i + 1].offset +
++			  offset * 4 * rmesa->state.aos[i + 1].stride;
++			OUT_BATCH_RELOC(voffset,
++					rmesa->state.aos[i+1].bo,
++					voffset,
++					RADEON_GEM_DOMAIN_GTT,
++					0, 0);
++		}
++		
++		if (nr & 1) {
++			OUT_BATCH((rmesa->state.aos[nr - 1].components << 0) |
++				  (rmesa->state.aos[nr - 1].stride << 8));
++			voffset =  rmesa->state.aos[nr - 1].offset +
++				offset * 4 * rmesa->state.aos[nr - 1].stride;
++			OUT_BATCH_RELOC(voffset,
++					rmesa->state.aos[nr - 1].bo,
++					voffset,
++					RADEON_GEM_DOMAIN_GTT,
++					0, 0);
++		}
++		END_BATCH();
++	} else {
+ 
+-		e32(rmesa->state.aos[i].aos_offset + offset * 4 * rmesa->state.aos[i].aos_stride);
+-		e32(rmesa->state.aos[i + 1].aos_offset + offset * 4 * rmesa->state.aos[i + 1].aos_stride);
++		BEGIN_BATCH(sz+2+(nr * 2));
++		OUT_BATCH_PACKET3(R300_PACKET3_3D_LOAD_VBPNTR, sz - 1);
++		OUT_BATCH(nr);
++
++		for (i = 0; i + 1 < nr; i += 2) {
++			OUT_BATCH((rmesa->state.aos[i].components << 0) |
++				  (rmesa->state.aos[i].stride << 8) |
++				  (rmesa->state.aos[i + 1].components << 16) |
++				  (rmesa->state.aos[i + 1].stride << 24));
++			
++			voffset =  rmesa->state.aos[i + 0].offset +
++				offset * 4 * rmesa->state.aos[i + 0].stride;
++			OUT_BATCH(voffset);
++			voffset =  rmesa->state.aos[i + 1].offset +
++				offset * 4 * rmesa->state.aos[i + 1].stride;
++			OUT_BATCH(voffset);
++		}
++		
++		if (nr & 1) {
++			OUT_BATCH((rmesa->state.aos[nr - 1].components << 0) |
++			  (rmesa->state.aos[nr - 1].stride << 8));
++			voffset =  rmesa->state.aos[nr - 1].offset +
++				offset * 4 * rmesa->state.aos[nr - 1].stride;
++			OUT_BATCH(voffset);
++		}
++		for (i = 0; i + 1 < nr; i += 2) {
++			voffset =  rmesa->state.aos[i + 0].offset +
++				offset * 4 * rmesa->state.aos[i + 0].stride;
++			radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
++					      rmesa->state.aos[i+0].bo,
++					      RADEON_GEM_DOMAIN_GTT,
++					      0, 0);
++			voffset =  rmesa->state.aos[i + 1].offset +
++				offset * 4 * rmesa->state.aos[i + 1].stride;
++			radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
++					      rmesa->state.aos[i+1].bo,
++					      RADEON_GEM_DOMAIN_GTT,
++					      0, 0);
++		}
++		if (nr & 1) {
++			voffset =  rmesa->state.aos[nr - 1].offset +
++				offset * 4 * rmesa->state.aos[nr - 1].stride;
++			radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
++					      rmesa->state.aos[nr-1].bo,
++					      RADEON_GEM_DOMAIN_GTT,
++					      0, 0);
++		}
++		END_BATCH();
+ 	}
+ 
+-	if (nr & 1) {
+-		e32((rmesa->state.aos[nr - 1].aos_size << 0) |
+-		    (rmesa->state.aos[nr - 1].aos_stride << 8));
+-		e32(rmesa->state.aos[nr - 1].aos_offset + offset * 4 * rmesa->state.aos[nr - 1].aos_stride);
+-	}
+ }
+ 
+ static void r300FireAOS(r300ContextPtr rmesa, int vertex_count, int type)
+ {
+-	int cmd_reserved = 0;
+-	int cmd_written = 0;
+-	drm_radeon_cmd_header_t *cmd = NULL;
++	BATCH_LOCALS(&rmesa->radeon);
+ 
+-	start_packet3(CP_PACKET3(R300_PACKET3_3D_DRAW_VBUF_2, 0), 0);
+-	e32(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST | (vertex_count << 16) | type);
++	BEGIN_BATCH(3);
++	OUT_BATCH_PACKET3(R300_PACKET3_3D_DRAW_VBUF_2, 0);
++	OUT_BATCH(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST | (vertex_count << 16) | type);
++	END_BATCH();
+ }
+ 
+ static void r300RunRenderPrimitive(r300ContextPtr rmesa, GLcontext * ctx,
+@@ -269,6 +346,12 @@ static void r300RunRenderPrimitive(r300ContextPtr rmesa, GLcontext * ctx,
+ 	if (type < 0 || num_verts <= 0)
+ 		return;
+ 
++	/* Make space for at least 64 dwords.
++	 * This is supposed to ensure that we can get all rendering
++	 * commands into a single command buffer.
++	 */
++	rcommonEnsureCmdBufSpace(&rmesa->radeon, 64, __FUNCTION__);
++
+ 	if (vb->Elts) {
+ 		if (num_verts > 65535) {
+ 			/* not implemented yet */
+@@ -288,11 +371,12 @@ static void r300RunRenderPrimitive(r300ContextPtr rmesa, GLcontext * ctx,
+ 		 */
+ 		r300EmitElts(ctx, vb->Elts, num_verts);
+ 		r300EmitAOS(rmesa, rmesa->state.aos_count, start);
+-		r300FireEB(rmesa, rmesa->state.elt_dma.aos_offset, num_verts, type);
++		r300FireEB(rmesa, num_verts, type);
+ 	} else {
+ 		r300EmitAOS(rmesa, rmesa->state.aos_count, start);
+ 		r300FireAOS(rmesa, num_verts, type);
+ 	}
++	COMMIT_BATCH();
+ }
+ 
+ static GLboolean r300RunRender(GLcontext * ctx,
+@@ -303,7 +387,6 @@ static GLboolean r300RunRender(GLcontext * ctx,
+ 	TNLcontext *tnl = TNL_CONTEXT(ctx);
+ 	struct vertex_buffer *vb = &tnl->vb;
+ 
+-
+ 	if (RADEON_DEBUG & DEBUG_PRIMS)
+ 		fprintf(stderr, "%s\n", __FUNCTION__);
+ 
+@@ -314,7 +397,7 @@ static GLboolean r300RunRender(GLcontext * ctx,
+ 	r300UpdateShaderStates(rmesa);
+ 
+ 	r300EmitCacheFlush(rmesa);
+-	r300EmitState(rmesa);
++	radeonEmitState(&rmesa->radeon);
+ 
+ 	for (i = 0; i < vb->PrimitiveCount; i++) {
+ 		GLuint prim = _tnl_translate_prim(&vb->Primitive[i]);
+@@ -325,10 +408,6 @@ static GLboolean r300RunRender(GLcontext * ctx,
+ 
+ 	r300EmitCacheFlush(rmesa);
+ 
+-#ifdef USER_BUFFERS
+-	r300UseArrays(ctx);
+-#endif
+-
+ 	r300ReleaseArrays(ctx);
+ 
+ 	return GL_FALSE;
+@@ -347,6 +426,8 @@ static GLboolean r300RunRender(GLcontext * ctx,
+ static int r300Fallback(GLcontext * ctx)
+ {
+ 	r300ContextPtr r300 = R300_CONTEXT(ctx);
++	const unsigned back = ctx->Stencil._BackFace;
++
+ 	/* Do we need to use new-style shaders?
+ 	 * Also is there a better way to do this? */
+ 	if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
+@@ -371,12 +452,14 @@ static int r300Fallback(GLcontext * ctx)
+ 
+ 	FALLBACK_IF(ctx->RenderMode != GL_RENDER);
+ 
+-	FALLBACK_IF(ctx->Stencil._TestTwoSide
+-		    && (ctx->Stencil.Ref[0] != ctx->Stencil.Ref[1]
+-			|| ctx->Stencil.ValueMask[0] !=
+-			ctx->Stencil.ValueMask[1]
+-			|| ctx->Stencil.WriteMask[0] !=
+-			ctx->Stencil.WriteMask[1]));
++	/* If GL_EXT_stencil_two_side is disabled, this fallback check can
++	 * be removed.
++	 */
++	FALLBACK_IF(ctx->Stencil.Ref[0] != ctx->Stencil.Ref[back]
++		    || ctx->Stencil.ValueMask[0] !=
++		    ctx->Stencil.ValueMask[back]
++		    || ctx->Stencil.WriteMask[0] !=
++		    ctx->Stencil.WriteMask[back]);
+ 
+ 	if (ctx->Extensions.NV_point_sprite || ctx->Extensions.ARB_point_sprite)
+ 		FALLBACK_IF(ctx->Point.PointSprite);
+@@ -428,6 +511,9 @@ static GLboolean r300RunTCLRender(GLcontext * ctx,
+ 		return GL_TRUE;
+ 	}
+ 
++	if (!r300ValidateBuffers(ctx))
++	    return GL_TRUE;
++	
+ 	r300UpdateShaders(rmesa);
+ 
+ 	vp = (struct r300_vertex_program *)CURRENT_VERTEX_SHADER(ctx);
+diff --git a/src/mesa/drivers/dri/r300/r300_state.c b/src/mesa/drivers/dri/r300/r300_state.c
+index a63dbac..59728a0 100644
+--- a/src/mesa/drivers/dri/r300/r300_state.c
++++ b/src/mesa/drivers/dri/r300/r300_state.c
+@@ -53,8 +53,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "vbo/vbo.h"
+ #include "tnl/tnl.h"
+ 
+-#include "radeon_ioctl.h"
+-#include "radeon_state.h"
+ #include "r300_context.h"
+ #include "r300_ioctl.h"
+ #include "r300_state.h"
+@@ -590,7 +588,7 @@ static void r300SetStencilState(GLcontext * ctx, GLboolean state)
+ {
+ 	r300ContextPtr r300 = R300_CONTEXT(ctx);
+ 
+-	if (r300->state.stencil.hw_stencil) {
++	if (r300->radeon.state.stencil.hwBuffer) {
+ 		R300_STATECHANGE(r300, zs);
+ 		if (state) {
+ 			r300->hw.zs.cmd[R300_ZS_CNTL_0] |=
+@@ -783,6 +781,7 @@ static void r300Fogfv(GLcontext * ctx, GLenum pname, const GLfloat * param)
+ 			    R300_FG_FOG_BLEND_FN_EXP2;
+ 			fogScale.f = 0.3 * ctx->Fog.Density;
+ 			fogStart.f = 0.0;
++                        break;
+ 		default:
+ 			return;
+ 		}
+@@ -974,15 +973,9 @@ static void r300StencilFuncSeparate(GLcontext * ctx, GLenum face,
+ {
+ 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+ 	GLuint refmask =
+-	    (((ctx->Stencil.
+-	       Ref[0] & 0xff) << R300_STENCILREF_SHIFT) | ((ctx->
+-							    Stencil.
+-							    ValueMask
+-							    [0] &
+-							    0xff)
+-							   <<
+-							   R300_STENCILMASK_SHIFT));
+-
++	    ((ctx->Stencil.Ref[0] & 0xff) << R300_STENCILREF_SHIFT)
++	     | ((ctx->Stencil.ValueMask[0] & 0xff) << R300_STENCILMASK_SHIFT);
++	const unsigned back = ctx->Stencil._BackFace;
+ 	GLuint flag;
+ 
+ 	R300_STATECHANGE(rmesa, zs);
+@@ -1000,8 +993,7 @@ static void r300StencilFuncSeparate(GLcontext * ctx, GLenum face,
+ 	rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
+ 	    (flag << R300_S_FRONT_FUNC_SHIFT);
+ 
+-	if (ctx->Stencil._TestTwoSide)
+-		flag = translate_func(ctx->Stencil.Function[1]);
++	flag = translate_func(ctx->Stencil.Function[back]);
+ 
+ 	rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
+ 	    (flag << R300_S_BACK_FUNC_SHIFT);
+@@ -1026,6 +1018,7 @@ static void r300StencilOpSeparate(GLcontext * ctx, GLenum face,
+ 				  GLenum fail, GLenum zfail, GLenum zpass)
+ {
+ 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
++	const unsigned back = ctx->Stencil._BackFace;
+ 
+ 	R300_STATECHANGE(rmesa, zs);
+ 	/* It is easier to mask what's left.. */
+@@ -1042,23 +1035,13 @@ static void r300StencilOpSeparate(GLcontext * ctx, GLenum face,
+ 	    | (translate_stencil_op(ctx->Stencil.ZPassFunc[0]) <<
+ 	       R300_S_FRONT_ZPASS_OP_SHIFT);
+ 
+-	if (ctx->Stencil._TestTwoSide) {
+-		rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
+-		    (translate_stencil_op(ctx->Stencil.FailFunc[1]) <<
+-		     R300_S_BACK_SFAIL_OP_SHIFT)
+-		    | (translate_stencil_op(ctx->Stencil.ZFailFunc[1]) <<
+-		       R300_S_BACK_ZFAIL_OP_SHIFT)
+-		    | (translate_stencil_op(ctx->Stencil.ZPassFunc[1]) <<
+-		       R300_S_BACK_ZPASS_OP_SHIFT);
+-	} else {
+-		rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
+-		    (translate_stencil_op(ctx->Stencil.FailFunc[0]) <<
+-		     R300_S_BACK_SFAIL_OP_SHIFT)
+-		    | (translate_stencil_op(ctx->Stencil.ZFailFunc[0]) <<
+-		       R300_S_BACK_ZFAIL_OP_SHIFT)
+-		    | (translate_stencil_op(ctx->Stencil.ZPassFunc[0]) <<
+-		       R300_S_BACK_ZPASS_OP_SHIFT);
+-	}
++	rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
++	    (translate_stencil_op(ctx->Stencil.FailFunc[back]) <<
++	     R300_S_BACK_SFAIL_OP_SHIFT)
++	    | (translate_stencil_op(ctx->Stencil.ZFailFunc[back]) <<
++	       R300_S_BACK_ZFAIL_OP_SHIFT)
++	    | (translate_stencil_op(ctx->Stencil.ZPassFunc[back]) <<
++	       R300_S_BACK_ZPASS_OP_SHIFT);
+ }
+ 
+ /* =============================================================
+@@ -1083,10 +1066,10 @@ static void r300UpdateWindow(GLcontext * ctx)
+ 	GLfloat tx = v[MAT_TX] + xoffset + SUBPIXEL_X;
+ 	GLfloat sy = -v[MAT_SY];
+ 	GLfloat ty = (-v[MAT_TY]) + yoffset + SUBPIXEL_Y;
+-	GLfloat sz = v[MAT_SZ] * rmesa->state.depth.scale;
+-	GLfloat tz = v[MAT_TZ] * rmesa->state.depth.scale;
++	GLfloat sz = v[MAT_SZ] * rmesa->radeon.state.depth.scale;
++	GLfloat tz = v[MAT_TZ] * rmesa->radeon.state.depth.scale;
+ 
+-	R300_FIREVERTICES(rmesa);
++	radeon_firevertices(&rmesa->radeon);
+ 	R300_STATECHANGE(rmesa, vpt);
+ 
+ 	rmesa->hw.vpt.cmd[R300_VPT_XSCALE] = r300PackFloat32(sx);
+@@ -1100,10 +1083,19 @@ static void r300UpdateWindow(GLcontext * ctx)
+ static void r300Viewport(GLcontext * ctx, GLint x, GLint y,
+ 			 GLsizei width, GLsizei height)
+ {
++	r300ContextPtr rmesa = R300_CONTEXT(ctx);
++    __DRIcontext *driContext = rmesa->radeon.dri.context;
+ 	/* Don't pipeline viewport changes, conflict with window offset
+ 	 * setting below.  Could apply deltas to rescue pipelined viewport
+ 	 * values, or keep the originals hanging around.
+ 	 */
++    if (rmesa->radeon.radeonScreen->driScreen->dri2.enabled) {
++        radeon_update_renderbuffers(driContext, driContext->driDrawablePriv);
++        if (driContext->driDrawablePriv != driContext->driReadablePriv) {
++            radeon_update_renderbuffers(driContext,
++                                        driContext->driReadablePriv);
++        }
++    }
+ 	r300UpdateWindow(ctx);
+ }
+ 
+@@ -1144,55 +1136,25 @@ void r300UpdateViewportOffset(GLcontext * ctx)
+ void r300UpdateDrawBuffer(GLcontext * ctx)
+ {
+ 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+-	r300ContextPtr r300 = rmesa;
+ 	struct gl_framebuffer *fb = ctx->DrawBuffer;
+-	driRenderbuffer *drb;
++	struct radeon_renderbuffer *rrb;
+ 
+ 	if (fb->_ColorDrawBufferIndexes[0] == BUFFER_FRONT_LEFT) {
+ 		/* draw to front */
+-		drb =
+-		    (driRenderbuffer *) fb->Attachment[BUFFER_FRONT_LEFT].
+-		    Renderbuffer;
++		rrb =
++		    (void *) fb->Attachment[BUFFER_FRONT_LEFT].Renderbuffer;
+ 	} else if (fb->_ColorDrawBufferIndexes[0] == BUFFER_BACK_LEFT) {
+ 		/* draw to back */
+-		drb =
+-		    (driRenderbuffer *) fb->Attachment[BUFFER_BACK_LEFT].
+-		    Renderbuffer;
++		rrb = (void *) fb->Attachment[BUFFER_BACK_LEFT].Renderbuffer;
+ 	} else {
+ 		/* drawing to multiple buffers, or none */
+ 		return;
+ 	}
+ 
+-	assert(drb);
+-	assert(drb->flippedPitch);
++	assert(rrb);
++	assert(rrb->pitch);
+ 
+ 	R300_STATECHANGE(rmesa, cb);
+-
+-	r300->hw.cb.cmd[R300_CB_OFFSET] = drb->flippedOffset +	//r300->radeon.state.color.drawOffset +
+-	    r300->radeon.radeonScreen->fbLocation;
+-	r300->hw.cb.cmd[R300_CB_PITCH] = drb->flippedPitch;	//r300->radeon.state.color.drawPitch;
+-
+-	if (r300->radeon.radeonScreen->cpp == 4)
+-		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_ARGB8888;
+-	else
+-		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_RGB565;
+-
+-	if (r300->radeon.sarea->tiling_enabled)
+-		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_TILE_ENABLE;
+-#if 0
+-	R200_STATECHANGE(rmesa, ctx);
+-
+-	/* Note: we used the (possibly) page-flipped values */
+-	rmesa->hw.ctx.cmd[CTX_RB3D_COLOROFFSET]
+-	    = ((drb->flippedOffset + rmesa->r200Screen->fbLocation)
+-	       & R200_COLOROFFSET_MASK);
+-	rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] = drb->flippedPitch;
+-
+-	if (rmesa->sarea->tiling_enabled) {
+-		rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |=
+-		    R200_COLOR_TILE_ENABLE;
+-	}
+-#endif
+ }
+ 
+ static void
+@@ -1412,7 +1374,8 @@ static void r300SetupFragmentShaderTextures(GLcontext *ctx, int *tmu_mappings)
+ 	}
+ 
+ 	r300->hw.fpt.cmd[R300_FPT_CMD_0] =
+-		cmdpacket0(R300_US_TEX_INST_0, code->tex.length);
++		cmdpacket0(r300->radeon.radeonScreen,
++                   R300_US_TEX_INST_0, code->tex.length);
+ }
+ 
+ static void r500SetupFragmentShaderTextures(GLcontext *ctx, int *tmu_mappings)
+@@ -1463,7 +1426,7 @@ static GLuint translate_lod_bias(GLfloat bias)
+ static void r300SetupTextures(GLcontext * ctx)
+ {
+ 	int i, mtu;
+-	struct r300_tex_obj *t;
++	struct radeon_tex_obj *t;
+ 	r300ContextPtr r300 = R300_CONTEXT(ctx);
+ 	int hw_tmu = 0;
+ 	int last_hw_tmu = -1;	/* -1 translates into no setup costs for fields */
+@@ -1497,21 +1460,16 @@ static void r300SetupTextures(GLcontext * ctx)
+ 	/* We cannot let disabled tmu offsets pass DRM */
+ 	for (i = 0; i < mtu; i++) {
+ 		if (ctx->Texture.Unit[i]._ReallyEnabled) {
+-
+-#if 0				/* Enables old behaviour */
+-			hw_tmu = i;
+-#endif
+ 			tmu_mappings[i] = hw_tmu;
+ 
+-			t = r300->state.texture.unit[i].texobj;
+-			/* XXX questionable fix for bug 9170: */
++			t = radeon_tex_obj(ctx->Texture.Unit[i]._Current);
+ 			if (!t)
+ 				continue;
+ 
+-			if ((t->format & 0xffffff00) == 0xffffff00) {
++			if ((t->pp_txformat & 0xffffff00) == 0xffffff00) {
+ 				WARN_ONCE
+ 				    ("unknown texture format (entry %x) encountered. Help me !\n",
+-				     t->format & 0xff);
++				     t->pp_txformat & 0xff);
+ 			}
+ 
+ 			if (RADEON_DEBUG & DEBUG_STATE)
+@@ -1522,29 +1480,28 @@ static void r300SetupTextures(GLcontext * ctx)
+ 
+ 			r300->hw.tex.filter.cmd[R300_TEX_VALUE_0 +
+ 						hw_tmu] =
+-			    gen_fixed_filter(t->filter) | (hw_tmu << 28);
++			    gen_fixed_filter(t->pp_txfilter) | (hw_tmu << 28);
+ 			/* Note: There is a LOD bias per texture unit and a LOD bias
+ 			 * per texture object. We add them here to get the correct behaviour.
+ 			 * (The per-texture object LOD bias was introduced in OpenGL 1.4
+ 			 * and is not present in the EXT_texture_object extension).
+ 			 */
+ 			r300->hw.tex.filter_1.cmd[R300_TEX_VALUE_0 + hw_tmu] =
+-				t->filter_1 |
+-				translate_lod_bias(ctx->Texture.Unit[i].LodBias + t->base.tObj->LodBias);
++				t->pp_txfilter_1 |
++				translate_lod_bias(ctx->Texture.Unit[i].LodBias + t->base.LodBias);
+ 			r300->hw.tex.size.cmd[R300_TEX_VALUE_0 + hw_tmu] =
+-			    t->size;
++			    t->pp_txsize;
+ 			r300->hw.tex.format.cmd[R300_TEX_VALUE_0 +
+-						hw_tmu] = t->format;
++						hw_tmu] = t->pp_txformat;
+ 			r300->hw.tex.pitch.cmd[R300_TEX_VALUE_0 + hw_tmu] =
+-			    t->pitch_reg;
+-			r300->hw.tex.offset.cmd[R300_TEX_VALUE_0 +
+-						hw_tmu] = t->offset;
++			  t->pp_txpitch;
++			r300->hw.textures[hw_tmu] = t;
+ 
+-			if (t->offset & R300_TXO_MACRO_TILE) {
++			if (t->tile_bits & R300_TXO_MACRO_TILE) {
+ 				WARN_ONCE("macro tiling enabled!\n");
+ 			}
+ 
+-			if (t->offset & R300_TXO_MICRO_TILE) {
++			if (t->tile_bits & R300_TXO_MICRO_TILE) {
+ 				WARN_ONCE("micro tiling enabled!\n");
+ 			}
+ 
+@@ -1561,21 +1518,21 @@ static void r300SetupTextures(GLcontext * ctx)
+ 	}
+ 
+ 	r300->hw.tex.filter.cmd[R300_TEX_CMD_0] =
+-	    cmdpacket0(R300_TX_FILTER0_0, last_hw_tmu + 1);
++	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_FILTER0_0, last_hw_tmu + 1);
+ 	r300->hw.tex.filter_1.cmd[R300_TEX_CMD_0] =
+-	    cmdpacket0(R300_TX_FILTER1_0, last_hw_tmu + 1);
++	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_FILTER1_0, last_hw_tmu + 1);
+ 	r300->hw.tex.size.cmd[R300_TEX_CMD_0] =
+-	    cmdpacket0(R300_TX_SIZE_0, last_hw_tmu + 1);
++	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_SIZE_0, last_hw_tmu + 1);
+ 	r300->hw.tex.format.cmd[R300_TEX_CMD_0] =
+-	    cmdpacket0(R300_TX_FORMAT_0, last_hw_tmu + 1);
++	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_FORMAT_0, last_hw_tmu + 1);
+ 	r300->hw.tex.pitch.cmd[R300_TEX_CMD_0] =
+-	    cmdpacket0(R300_TX_FORMAT2_0, last_hw_tmu + 1);
++	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_FORMAT2_0, last_hw_tmu + 1);
+ 	r300->hw.tex.offset.cmd[R300_TEX_CMD_0] =
+-	    cmdpacket0(R300_TX_OFFSET_0, last_hw_tmu + 1);
++	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_OFFSET_0, last_hw_tmu + 1);
+ 	r300->hw.tex.chroma_key.cmd[R300_TEX_CMD_0] =
+-	    cmdpacket0(R300_TX_CHROMA_KEY_0, last_hw_tmu + 1);
++	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_CHROMA_KEY_0, last_hw_tmu + 1);
+ 	r300->hw.tex.border_color.cmd[R300_TEX_CMD_0] =
+-	    cmdpacket0(R300_TX_BORDER_COLOR_0, last_hw_tmu + 1);
++	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_BORDER_COLOR_0, last_hw_tmu + 1);
+ 
+ 	if (!fp)		/* should only happenen once, just after context is created */
+ 		return;
+@@ -1587,7 +1544,7 @@ static void r300SetupTextures(GLcontext * ctx)
+ 			r300->hw.txe.cmd[R300_TXE_ENABLE] |= 1;
+ 			r300->hw.tex.filter.cmd[R300_TEX_VALUE_0] = 0;
+ 			r300->hw.tex.filter.cmd[R300_TEX_CMD_0] =
+-				cmdpacket0(R300_TX_FILTER0_0, 1);
++				cmdpacket0(r300->radeon.radeonScreen, R300_TX_FILTER0_0, 1);
+ 		}
+ 		r300SetupFragmentShaderTextures(ctx, tmu_mappings);
+ 	} else
+@@ -1756,7 +1713,7 @@ static void r300SetupRSUnit(GLcontext * ctx)
+ 	  | R300_HIRES_EN;
+ 
+ 	assert(high_rr >= 0);
+-	r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(R300_RS_INST_0, high_rr + 1);
++	r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_RS_INST_0, high_rr + 1);
+ 	r300->hw.rc.cmd[2] = high_rr;
+ 
+ 	if (InputsRead)
+@@ -1916,7 +1873,7 @@ static void r500SetupRSUnit(GLcontext * ctx)
+ 	  | R300_HIRES_EN;
+ 
+ 	assert(high_rr >= 0);
+-	r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(R500_RS_INST_0, high_rr + 1);
++	r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R500_RS_INST_0, high_rr + 1);
+ 	r300->hw.rc.cmd[2] = 0xC0 | high_rr;
+ 
+ 	if (InputsRead)
+@@ -2114,6 +2071,7 @@ static void r300SetupRealVertexProgram(r300ContextPtr rmesa)
+ 	  (inst_count << R300_PVS_LAST_VTX_SRC_INST_SHIFT);
+ }
+ 
++
+ static void r300SetupVertexProgram(r300ContextPtr rmesa)
+ {
+ 	GLcontext *ctx = rmesa->radeon.glCtx;
+@@ -2143,6 +2101,7 @@ static void r300SetupVertexProgram(r300ContextPtr rmesa)
+  */
+ static void r300Enable(GLcontext * ctx, GLenum cap, GLboolean state)
+ {
++	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+ 	if (RADEON_DEBUG & DEBUG_STATE)
+ 		fprintf(stderr, "%s( %s = %s )\n", __FUNCTION__,
+ 			_mesa_lookup_enum_by_nr(cap),
+@@ -2188,8 +2147,12 @@ static void r300Enable(GLcontext * ctx, GLenum cap, GLboolean state)
+ 	case GL_POLYGON_OFFSET_FILL:
+ 		r300SetPolygonOffsetState(ctx, state);
+ 		break;
++	case GL_SCISSOR_TEST:
++		radeon_firevertices(&rmesa->radeon);
++		rmesa->radeon.state.scissor.enabled = state;
++		radeonUpdateScissor( ctx );
++		break;
+ 	default:
+-		radeonEnable(ctx, cap, state);
+ 		break;
+ 	}
+ }
+@@ -2200,6 +2163,7 @@ static void r300Enable(GLcontext * ctx, GLenum cap, GLboolean state)
+ static void r300ResetHwState(r300ContextPtr r300)
+ {
+ 	GLcontext *ctx = r300->radeon.glCtx;
++	struct radeon_renderbuffer *rrb;
+ 	int has_tcl = 1;
+ 
+ 	if (!(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
+@@ -2230,8 +2194,6 @@ static void r300ResetHwState(r300ContextPtr r300)
+ 
+ 	r300UpdateCulling(ctx);
+ 
+-	r300UpdateTextureState(ctx);
+-
+ 	r300SetBlendState(ctx);
+ 	r300SetLogicOpState(ctx);
+ 
+@@ -2378,20 +2340,6 @@ static void r300ResetHwState(r300ContextPtr r300)
+ 
+ 	r300BlendColor(ctx, ctx->Color.BlendColor);
+ 
+-	/* Again, r300ClearBuffer uses this */
+-	r300->hw.cb.cmd[R300_CB_OFFSET] =
+-	    r300->radeon.state.color.drawOffset +
+-	    r300->radeon.radeonScreen->fbLocation;
+-	r300->hw.cb.cmd[R300_CB_PITCH] = r300->radeon.state.color.drawPitch;
+-
+-	if (r300->radeon.radeonScreen->cpp == 4)
+-		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_ARGB8888;
+-	else
+-		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_RGB565;
+-
+-	if (r300->radeon.sarea->tiling_enabled)
+-		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_TILE_ENABLE;
+-
+ 	r300->hw.rb3d_dither_ctl.cmd[1] = 0;
+ 	r300->hw.rb3d_dither_ctl.cmd[2] = 0;
+ 	r300->hw.rb3d_dither_ctl.cmd[3] = 0;
+@@ -2407,12 +2355,8 @@ static void r300ResetHwState(r300ContextPtr r300)
+ 	r300->hw.rb3d_discard_src_pixel_lte_threshold.cmd[1] = 0x00000000;
+ 	r300->hw.rb3d_discard_src_pixel_lte_threshold.cmd[2] = 0xffffffff;
+ 
+-	r300->hw.zb.cmd[R300_ZB_OFFSET] =
+-	    r300->radeon.radeonScreen->depthOffset +
+-	    r300->radeon.radeonScreen->fbLocation;
+-	r300->hw.zb.cmd[R300_ZB_PITCH] = r300->radeon.radeonScreen->depthPitch;
+-
+-	if (r300->radeon.sarea->tiling_enabled) {
++	rrb = r300->radeon.state.depth.rrb;
++	if (rrb && rrb->bo && (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE)) {
+ 		/* XXX: Turn off when clearing buffers ? */
+ 		r300->hw.zb.cmd[R300_ZB_PITCH] |= R300_DEPTHMACROTILE_ENABLE;
+ 
+@@ -2423,18 +2367,6 @@ static void r300ResetHwState(r300ContextPtr r300)
+ 
+ 	r300->hw.zb_depthclearvalue.cmd[1] = 0;
+ 
+-	switch (ctx->Visual.depthBits) {
+-	case 16:
+-		r300->hw.zstencil_format.cmd[1] = R300_DEPTHFORMAT_16BIT_INT_Z;
+-		break;
+-	case 24:
+-		r300->hw.zstencil_format.cmd[1] = R300_DEPTHFORMAT_24BIT_INT_Z_8BIT_STENCIL;
+-		break;
+-	default:
+-		fprintf(stderr, "Error: Unsupported depth %d... exiting\n", ctx->Visual.depthBits);
+-		_mesa_exit(-1);
+-	}
+-
+ 	r300->hw.zstencil_format.cmd[2] = R300_ZTOP_DISABLE;
+ 	r300->hw.zstencil_format.cmd[3] = 0x00000003;
+ 	r300->hw.zstencil_format.cmd[4] = 0x00000000;
+@@ -2455,7 +2387,7 @@ static void r300ResetHwState(r300ContextPtr r300)
+ 		r300->hw.vps.cmd[R300_VPS_ZERO_3] = 0;
+ 	}
+ 
+-	r300->hw.all_dirty = GL_TRUE;
++	r300->radeon.hw.all_dirty = GL_TRUE;
+ }
+ 
+ void r300UpdateShaders(r300ContextPtr rmesa)
+@@ -2466,8 +2398,8 @@ void r300UpdateShaders(r300ContextPtr rmesa)
+ 
+ 	ctx = rmesa->radeon.glCtx;
+ 
+-	if (rmesa->NewGLState && hw_tcl_on) {
+-		rmesa->NewGLState = 0;
++	if (rmesa->radeon.NewGLState && hw_tcl_on) {
++		rmesa->radeon.NewGLState = 0;
+ 
+ 		for (i = _TNL_FIRST_MAT; i <= _TNL_LAST_MAT; i++) {
+ 			rmesa->temp_attrib[i] =
+@@ -2546,10 +2478,10 @@ static void r300SetupPixelShader(r300ContextPtr rmesa)
+ 	R300_STATECHANGE(rmesa, fpi[1]);
+ 	R300_STATECHANGE(rmesa, fpi[2]);
+ 	R300_STATECHANGE(rmesa, fpi[3]);
+-	rmesa->hw.fpi[0].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_INST_0, code->alu.length);
+-	rmesa->hw.fpi[1].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_ADDR_0, code->alu.length);
+-	rmesa->hw.fpi[2].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_INST_0, code->alu.length);
+-	rmesa->hw.fpi[3].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_ADDR_0, code->alu.length);
++	rmesa->hw.fpi[0].cmd[R300_FPI_CMD_0] = cmdpacket0(rmesa->radeon.radeonScreen, R300_US_ALU_RGB_INST_0, code->alu.length);
++	rmesa->hw.fpi[1].cmd[R300_FPI_CMD_0] = cmdpacket0(rmesa->radeon.radeonScreen, R300_US_ALU_RGB_ADDR_0, code->alu.length);
++	rmesa->hw.fpi[2].cmd[R300_FPI_CMD_0] = cmdpacket0(rmesa->radeon.radeonScreen, R300_US_ALU_ALPHA_INST_0, code->alu.length);
++	rmesa->hw.fpi[3].cmd[R300_FPI_CMD_0] = cmdpacket0(rmesa->radeon.radeonScreen, R300_US_ALU_ALPHA_ADDR_0, code->alu.length);
+ 	for (i = 0; i < code->alu.length; i++) {
+ 		rmesa->hw.fpi[0].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst0;
+ 		rmesa->hw.fpi[1].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst1;
+@@ -2580,7 +2512,7 @@ static void r300SetupPixelShader(r300ContextPtr rmesa)
+ 	}
+ 
+ 	R300_STATECHANGE(rmesa, fpp);
+-	rmesa->hw.fpp.cmd[R300_FPP_CMD_0] = cmdpacket0(R300_PFS_PARAM_0_X, code->const_nr * 4);
++	rmesa->hw.fpp.cmd[R300_FPP_CMD_0] = cmdpacket0(rmesa->radeon.radeonScreen, R300_PFS_PARAM_0_X, code->const_nr * 4);
+ 	for (i = 0; i < code->const_nr; i++) {
+ 		const GLfloat *constant = get_fragmentprogram_constant(ctx,
+ 			&fp->mesa_program.Base, code->constant[i]);
+@@ -2682,7 +2614,6 @@ void r300UpdateShaderStates(r300ContextPtr rmesa)
+ 	GLcontext *ctx;
+ 	ctx = rmesa->radeon.glCtx;
+ 
+-	r300UpdateTextureState(ctx);
+ 	r300SetEarlyZState(ctx);
+ 
+ 	GLuint fgdepthsrc = R300_FG_DEPTH_SRC_SCAN;
+@@ -2727,7 +2658,7 @@ static void r300InvalidateState(GLcontext * ctx, GLuint new_state)
+ 
+ 	r300UpdateStateParameters(ctx, new_state);
+ 
+-	r300->NewGLState |= new_state;
++	r300->radeon.NewGLState |= new_state;
+ }
+ 
+ /**
+@@ -2740,26 +2671,9 @@ void r300InitState(r300ContextPtr r300)
+ 	GLcontext *ctx = r300->radeon.glCtx;
+ 	GLuint depth_fmt;
+ 
+-	radeonInitState(&r300->radeon);
+-
+-	switch (ctx->Visual.depthBits) {
+-	case 16:
+-		r300->state.depth.scale = 1.0 / (GLfloat) 0xffff;
+-		depth_fmt = R300_DEPTHFORMAT_16BIT_INT_Z;
+-		break;
+-	case 24:
+-		r300->state.depth.scale = 1.0 / (GLfloat) 0xffffff;
+-		depth_fmt = R300_DEPTHFORMAT_24BIT_INT_Z_8BIT_STENCIL;
+-		break;
+-	default:
+-		fprintf(stderr, "Error: Unsupported depth %d... exiting\n",
+-			ctx->Visual.depthBits);
+-		_mesa_exit(-1);
+-	}
+-
+ 	/* Only have hw stencil when depth buffer is 24 bits deep */
+-	r300->state.stencil.hw_stencil = (ctx->Visual.stencilBits > 0 &&
+-					  ctx->Visual.depthBits == 24);
++	r300->radeon.state.stencil.hwBuffer = (ctx->Visual.stencilBits > 0 &&
++					       ctx->Visual.depthBits == 24);
+ 
+ 	memset(&(r300->state.texture), 0, sizeof(r300->state.texture));
+ 
+@@ -2791,12 +2705,33 @@ void r300UpdateClipPlanes( GLcontext *ctx )
+ 	}
+ }
+ 
++static void r300DrawBuffer( GLcontext *ctx, GLenum mode )
++{
++	r300ContextPtr rmesa = R300_CONTEXT(ctx);
++	if (RADEON_DEBUG & DEBUG_DRI)
++		fprintf(stderr, "%s %s\n", __FUNCTION__,
++			_mesa_lookup_enum_by_nr( mode ));
++
++	radeon_firevertices(&rmesa->radeon);	/* don't pipeline cliprect changes */
++	radeonSetCliprects( &rmesa->radeon );
++	/* Page flipping is only refreshed when DRI2 is not enabled. */
++	if (!rmesa->radeon.radeonScreen->driScreen->dri2.enabled)
++		radeonUpdatePageFlipping(&rmesa->radeon);
++}
++
++static void r300ReadBuffer( GLcontext *ctx, GLenum mode )
++{
++	/* No hardware state is touched here; the call is only traced. */
++	if (RADEON_DEBUG & DEBUG_DRI)
++		fprintf(stderr, "%s %s\n", __FUNCTION__,
++			_mesa_lookup_enum_by_nr( mode ));
++}
++
+ /**
+  * Initialize driver's state callback functions
+  */
+ void r300InitStateFuncs(struct dd_function_table *functions)
+ {
+-	radeonInitStateFuncs(functions);
+ 
+ 	functions->UpdateState = r300InvalidateState;
+ 	functions->AlphaFunc = r300AlphaFunc;
+@@ -2833,4 +2768,8 @@ void r300InitStateFuncs(struct dd_function_table *functions)
+ 	functions->RenderMode = r300RenderMode;
+ 
+ 	functions->ClipPlane = r300ClipPlane;
++	functions->Scissor = radeonScissor;
++
++	functions->DrawBuffer		= r300DrawBuffer;
++	functions->ReadBuffer		= r300ReadBuffer;
+ }
+diff --git a/src/mesa/drivers/dri/r300/r300_state.h b/src/mesa/drivers/dri/r300/r300_state.h
+index 0589ab7..247a20e 100644
+--- a/src/mesa/drivers/dri/r300/r300_state.h
++++ b/src/mesa/drivers/dri/r300/r300_state.h
+@@ -39,32 +39,17 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ #define R300_NEWPRIM( rmesa )			\
+   do {						\
+-    if ( rmesa->dma.flush )			\
+-      rmesa->dma.flush( rmesa );		\
++  if ( rmesa->radeon.dma.flush )			\
++    rmesa->radeon.dma.flush( rmesa->radeon.glCtx );	\
+   } while (0)
+ 
+ #define R300_STATECHANGE(r300, atom) \
+ 	do {						\
+ 	  R300_NEWPRIM(r300);				\
+ 		r300->hw.atom.dirty = GL_TRUE;		\
+-		r300->hw.is_dirty = GL_TRUE;		\
++		r300->radeon.hw.is_dirty = GL_TRUE;		\
+ 	} while(0)
+ 
+-#define R300_PRINT_STATE(r300, atom) \
+-		r300PrintStateAtom(r300, &r300->hw.atom)
+-
+-/* Fire the buffered vertices no matter what.
+-   TODO: This has not been implemented yet
+- */
+-#define R300_FIREVERTICES( r300 )			\
+-do {							\
+-    \
+-   if ( (r300)->cmdbuf.count_used || (r300)->dma.flush ) {	\
+-      r300Flush( (r300)->radeon.glCtx );		\
+-   }							\
+-    \
+-} while (0)
+-
+ // r300_state.c
+ extern int future_hw_tcl_on;
+ void _tnl_UpdateFixedFunctionProgram (GLcontext * ctx);
+diff --git a/src/mesa/drivers/dri/r300/r300_swtcl.c b/src/mesa/drivers/dri/r300/r300_swtcl.c
+index b6e7ce1..0d8b7e5 100644
+--- a/src/mesa/drivers/dri/r300/r300_swtcl.c
++++ b/src/mesa/drivers/dri/r300/r300_swtcl.c
+@@ -56,26 +56,23 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "r300_state.h"
+ #include "r300_ioctl.h"
+ #include "r300_emit.h"
+-#include "r300_mem.h"
++#include "r300_tex.h"
+ 
+-static void flush_last_swtcl_prim( r300ContextPtr rmesa  );
+-
+-
+-void r300EmitVertexAOS(r300ContextPtr rmesa, GLuint vertex_size, GLuint offset);
++void r300EmitVertexAOS(r300ContextPtr rmesa, GLuint vertex_size, struct radeon_bo *bo, GLuint offset);
+ void r300EmitVbufPrim(r300ContextPtr rmesa, GLuint primitive, GLuint vertex_nr);
+ #define EMIT_ATTR( ATTR, STYLE )					\
+ do {									\
+-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].attrib = (ATTR);	\
+-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].format = (STYLE);	\
+-   rmesa->swtcl.vertex_attr_count++;					\
++   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].attrib = (ATTR);	\
++   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].format = (STYLE);	\
++   rmesa->radeon.swtcl.vertex_attr_count++;					\
+ } while (0)
+ 
+ #define EMIT_PAD( N )							\
+ do {									\
+-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].attrib = 0;		\
+-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].format = EMIT_PAD;	\
+-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].offset = (N);		\
+-   rmesa->swtcl.vertex_attr_count++;					\
++   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].attrib = 0;		\
++   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].format = EMIT_PAD;	\
++   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].offset = (N);		\
++   rmesa->radeon.swtcl.vertex_attr_count++;					\
+ } while (0)
+ 
+ static void r300SetVertexFormat( GLcontext *ctx )
+@@ -86,7 +83,6 @@ static void r300SetVertexFormat( GLcontext *ctx )
+ 	DECLARE_RENDERINPUTS(index_bitset);
+ 	GLuint InputsRead = 0, OutputsWritten = 0;
+ 	int vap_fmt_0 = 0;
+-	int vap_vte_cntl = 0;
+ 	int offset = 0;
+ 	int vte = 0;
+ 	GLint inputs[VERT_ATTRIB_MAX];
+@@ -114,7 +110,7 @@ static void r300SetVertexFormat( GLcontext *ctx )
+ 	}
+ 
+ 	assert( VB->AttribPtr[VERT_ATTRIB_POS] != NULL );
+-	rmesa->swtcl.vertex_attr_count = 0;
++	rmesa->radeon.swtcl.vertex_attr_count = 0;
+ 
+ 	/* EMIT_ATTR's must be in order as they tell t_vertex.c how to
+ 	 * build up a hardware vertex.
+@@ -175,7 +171,7 @@ static void r300SetVertexFormat( GLcontext *ctx )
+ 			inputs[i] = -1;
+ 		}
+ 	}
+-	
++
+ 	/* Fixed, apply to vir0 only */
+ 	if (InputsRead & (1 << VERT_ATTRIB_POS))
+ 		inputs[VERT_ATTRIB_POS] = 0;
+@@ -186,16 +182,16 @@ static void r300SetVertexFormat( GLcontext *ctx )
+ 	for (i = VERT_ATTRIB_TEX0; i <= VERT_ATTRIB_TEX7; i++)
+ 		if (InputsRead & (1 << i))
+ 			inputs[i] = 6 + (i - VERT_ATTRIB_TEX0);
+-	
++
+ 	for (i = 0, nr = 0; i < VERT_ATTRIB_MAX; i++) {
+ 		if (InputsRead & (1 << i)) {
+ 			tab[nr++] = i;
+ 		}
+ 	}
+-	
++
+ 	for (i = 0; i < nr; i++) {
+ 		int ci;
+-		
++
+ 		swizzle[i][0] = SWIZZLE_ZERO;
+ 		swizzle[i][1] = SWIZZLE_ZERO;
+ 		swizzle[i][2] = SWIZZLE_ZERO;
+@@ -215,98 +211,29 @@ static void r300SetVertexFormat( GLcontext *ctx )
+ 	((drm_r300_cmd_header_t *) rmesa->hw.vir[1].cmd)->packet0.count =
+ 		r300VAPInputRoute1(&rmesa->hw.vir[1].cmd[R300_VIR_CNTL_0], swizzle,
+ 				   nr);
+-   
++
+ 	R300_STATECHANGE(rmesa, vic);
+ 	rmesa->hw.vic.cmd[R300_VIC_CNTL_0] = r300VAPInputCntl0(ctx, InputsRead);
+ 	rmesa->hw.vic.cmd[R300_VIC_CNTL_1] = r300VAPInputCntl1(ctx, InputsRead);
+-   
++
+ 	R300_STATECHANGE(rmesa, vof);
+ 	rmesa->hw.vof.cmd[R300_VOF_CNTL_0] = r300VAPOutputCntl0(ctx, OutputsWritten);
+ 	rmesa->hw.vof.cmd[R300_VOF_CNTL_1] = vap_fmt_1;
+-   
+-	rmesa->swtcl.vertex_size =
++
++	rmesa->radeon.swtcl.vertex_size =
+ 		_tnl_install_attrs( ctx,
+-				    rmesa->swtcl.vertex_attrs, 
+-				    rmesa->swtcl.vertex_attr_count,
++				    rmesa->radeon.swtcl.vertex_attrs,
++				    rmesa->radeon.swtcl.vertex_attr_count,
+ 				    NULL, 0 );
+-	
+-	rmesa->swtcl.vertex_size /= 4;
++
++	rmesa->radeon.swtcl.vertex_size /= 4;
+ 
+ 	RENDERINPUTS_COPY( rmesa->tnl_index_bitset, index_bitset );
+ 
+ 
+ 	R300_STATECHANGE(rmesa, vte);
+ 	rmesa->hw.vte.cmd[1] = vte;
+-	rmesa->hw.vte.cmd[2] = rmesa->swtcl.vertex_size;
+-}
+-
+-
+-/* Flush vertices in the current dma region.
+- */
+-static void flush_last_swtcl_prim( r300ContextPtr rmesa  )
+-{
+-	if (RADEON_DEBUG & DEBUG_IOCTL)
+-		fprintf(stderr, "%s\n", __FUNCTION__);
+-	
+-	rmesa->dma.flush = NULL;
+-
+-	if (rmesa->dma.current.buf) {
+-		struct r300_dma_region *current = &rmesa->dma.current;
+-		GLuint current_offset = GET_START(current);
+-
+-		assert (current->start + 
+-			rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
+-			current->ptr);
+-
+-		if (rmesa->dma.current.start != rmesa->dma.current.ptr) {
+-
+-			r300EnsureCmdBufSpace( rmesa, rmesa->hw.max_state_size + (12*sizeof(int)), __FUNCTION__);
+-			
+-			r300EmitState(rmesa);
+-			
+-			r300EmitVertexAOS( rmesa,
+-					   rmesa->swtcl.vertex_size,
+-					   current_offset);
+-			
+-			r300EmitVbufPrim( rmesa,
+-					  rmesa->swtcl.hw_primitive,
+-					  rmesa->swtcl.numverts);
+-			
+-			r300EmitCacheFlush(rmesa);
+-		}
+-		
+-		rmesa->swtcl.numverts = 0;
+-		current->start = current->ptr;
+-	}
+-}
+-
+-/* Alloc space in the current dma region.
+- */
+-static void *
+-r300AllocDmaLowVerts( r300ContextPtr rmesa, int nverts, int vsize )
+-{
+-	GLuint bytes = vsize * nverts;
+-
+-	if ( rmesa->dma.current.ptr + bytes > rmesa->dma.current.end ) 
+-		r300RefillCurrentDmaRegion( rmesa, bytes);
+-
+-	if (!rmesa->dma.flush) {
+-		rmesa->radeon.glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
+-		rmesa->dma.flush = flush_last_swtcl_prim;
+-	}
+-
+-	ASSERT( vsize == rmesa->swtcl.vertex_size * 4 );
+-	ASSERT( rmesa->dma.flush == flush_last_swtcl_prim );
+-	ASSERT( rmesa->dma.current.start + 
+-		rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
+-		rmesa->dma.current.ptr );
+-
+-	{
+-		GLubyte *head = (GLubyte *) (rmesa->dma.current.address + rmesa->dma.current.ptr);
+-		rmesa->dma.current.ptr += bytes;
+-		rmesa->swtcl.numverts += nverts;
+-		return head;
+-	}
++	rmesa->hw.vte.cmd[2] = rmesa->radeon.swtcl.vertex_size;
+ }
+ 
+ static GLuint reduced_prim[] = {
+@@ -346,13 +273,13 @@ static void r300RenderPrimitive( GLcontext *ctx, GLenum prim );
+ #undef LOCAL_VARS
+ #undef ALLOC_VERTS
+ #define CTX_ARG r300ContextPtr rmesa
+-#define GET_VERTEX_DWORDS() rmesa->swtcl.vertex_size
+-#define ALLOC_VERTS( n, size ) r300AllocDmaLowVerts( rmesa, n, size * 4 )
++#define GET_VERTEX_DWORDS() rmesa->radeon.swtcl.vertex_size
++#define ALLOC_VERTS( n, size ) rcommonAllocDmaLowVerts( &rmesa->radeon, n, size * 4 )
+ #define LOCAL_VARS						\
+    r300ContextPtr rmesa = R300_CONTEXT(ctx);		\
+-   const char *r300verts = (char *)rmesa->swtcl.verts;
++   const char *r300verts = (char *)rmesa->radeon.swtcl.verts;
+ #define VERT(x) (r300Vertex *)(r300verts + ((x) * vertsize * sizeof(int)))
+-#define VERTEX r300Vertex 
++#define VERTEX r300Vertex
+ #define DO_DEBUG_VERTS (1 && (RADEON_DEBUG & DEBUG_VERTS))
+ #define PRINT_VERTEX(x)
+ #undef TAG
+@@ -409,7 +336,7 @@ static struct {
+ #define VERT_Y(_v) _v->v.y
+ #define VERT_Z(_v) _v->v.z
+ #define AREA_IS_CCW( a ) (a < 0)
+-#define GET_VERTEX(e) (rmesa->swtcl.verts + (e*rmesa->swtcl.vertex_size*sizeof(int)))
++#define GET_VERTEX(e) (rmesa->radeon.swtcl.verts + (e*rmesa->radeon.swtcl.vertex_size*sizeof(int)))
+ 
+ /* Only used to pull back colors into vertices (ie, we know color is
+  * floating point).
+@@ -455,7 +382,7 @@ do {							\
+  ***********************************************************************/
+ 
+ #define RASTERIZE(x) r300RasterPrimitive( ctx, reduced_prim[x] )
+-#define RENDER_PRIMITIVE rmesa->swtcl.render_primitive
++#define RENDER_PRIMITIVE rmesa->radeon.swtcl.render_primitive
+ #undef TAG
+ #define TAG(x) x
+ #include "tnl_dd/t_dd_unfilled.h"
+@@ -512,8 +439,8 @@ static void init_rast_tab( void )
+ #undef LOCAL_VARS
+ #define LOCAL_VARS						\
+    r300ContextPtr rmesa = R300_CONTEXT(ctx);		\
+-   const GLuint vertsize = rmesa->swtcl.vertex_size;		\
+-   const char *r300verts = (char *)rmesa->swtcl.verts;		\
++   const GLuint vertsize = rmesa->radeon.swtcl.vertex_size;		\
++   const char *r300verts = (char *)rmesa->radeon.swtcl.verts;		\
+    const GLuint * const elt = TNL_CONTEXT(ctx)->vb.Elts;	\
+    const GLboolean stipple = ctx->Line.StippleFlag;		\
+    (void) elt; (void) stipple;
+@@ -545,7 +472,7 @@ static void r300ChooseRenderState( GLcontext *ctx )
+ 	if (flags & DD_TRI_LIGHT_TWOSIDE) index |= R300_TWOSIDE_BIT;
+ 	if (flags & DD_TRI_UNFILLED)      index |= R300_UNFILLED_BIT;
+ 
+-	if (index != rmesa->swtcl.RenderIndex) {
++	if (index != rmesa->radeon.swtcl.RenderIndex) {
+ 		tnl->Driver.Render.Points = rast_tab[index].points;
+ 		tnl->Driver.Render.Line = rast_tab[index].line;
+ 		tnl->Driver.Render.ClippedLine = rast_tab[index].line;
+@@ -562,7 +489,7 @@ static void r300ChooseRenderState( GLcontext *ctx )
+ 			tnl->Driver.Render.ClippedPolygon = _tnl_RenderClippedPolygon;
+ 		}
+ 
+-		rmesa->swtcl.RenderIndex = index;
++		rmesa->radeon.swtcl.RenderIndex = index;
+ 	}
+ }
+ 
+@@ -572,18 +499,18 @@ static void r300RenderStart(GLcontext *ctx)
+         r300ContextPtr rmesa = R300_CONTEXT( ctx );
+ 	//	fprintf(stderr, "%s\n", __FUNCTION__);
+ 
+-	r300ChooseRenderState(ctx);	
++	r300ChooseRenderState(ctx);
+ 	r300SetVertexFormat(ctx);
+ 
++	r300ValidateBuffers(ctx);
++
+ 	r300UpdateShaders(rmesa);
+ 	r300UpdateShaderStates(rmesa);
+ 
+ 	r300EmitCacheFlush(rmesa);
+-	
+-	if (rmesa->dma.flush != 0 && 
+-	    rmesa->dma.flush != flush_last_swtcl_prim)
+-		rmesa->dma.flush( rmesa );
+-
++	if (rmesa->radeon.dma.flush != NULL) {
++		rmesa->radeon.dma.flush(ctx);
++	}
+ }
+ 
+ static void r300RenderFinish(GLcontext *ctx)
+@@ -593,10 +520,10 @@ static void r300RenderFinish(GLcontext *ctx)
+ static void r300RasterPrimitive( GLcontext *ctx, GLuint hwprim )
+ {
+ 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+-	
+-	if (rmesa->swtcl.hw_primitive != hwprim) {
++
++	if (rmesa->radeon.swtcl.hw_primitive != hwprim) {
+ 	        R300_NEWPRIM( rmesa );
+-		rmesa->swtcl.hw_primitive = hwprim;
++		rmesa->radeon.swtcl.hw_primitive = hwprim;
+ 	}
+ }
+ 
+@@ -604,14 +531,14 @@ static void r300RenderPrimitive(GLcontext *ctx, GLenum prim)
+ {
+ 
+ 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+-	rmesa->swtcl.render_primitive = prim;
++	rmesa->radeon.swtcl.render_primitive = prim;
+ 
+ 	if ((prim == GL_TRIANGLES) && (ctx->_TriangleCaps & DD_TRI_UNFILLED))
+ 	  return;
+ 
+ 	r300RasterPrimitive( ctx, reduced_prim[prim] );
+ 	//	fprintf(stderr, "%s\n", __FUNCTION__);
+-	
++
+ }
+ 
+ static void r300ResetLineStipple(GLcontext *ctx)
+@@ -625,12 +552,12 @@ void r300InitSwtcl(GLcontext *ctx)
+ 	TNLcontext *tnl = TNL_CONTEXT(ctx);
+ 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+ 	static int firsttime = 1;
+-	
++
+ 	if (firsttime) {
+ 		init_rast_tab();
+ 		firsttime = 0;
+ 	}
+-	
++
+ 	tnl->Driver.Render.Start = r300RenderStart;
+ 	tnl->Driver.Render.Finish = r300RenderFinish;
+ 	tnl->Driver.Render.PrimitiveNotify = r300RenderPrimitive;
+@@ -638,15 +565,15 @@ void r300InitSwtcl(GLcontext *ctx)
+ 	tnl->Driver.Render.BuildVertices = _tnl_build_vertices;
+ 	tnl->Driver.Render.CopyPV = _tnl_copy_pv;
+ 	tnl->Driver.Render.Interp = _tnl_interp;
+-	
++
+ 	/* FIXME: what are these numbers? */
+-	_tnl_init_vertices( ctx, ctx->Const.MaxArrayLockSize + 12, 
++	_tnl_init_vertices( ctx, ctx->Const.MaxArrayLockSize + 12,
+ 			    48 * sizeof(GLfloat) );
+-	
+-	rmesa->swtcl.verts = (GLubyte *)tnl->clipspace.vertex_buf;
+-	rmesa->swtcl.RenderIndex = ~0;
+-	rmesa->swtcl.render_primitive = GL_TRIANGLES;
+-	rmesa->swtcl.hw_primitive = 0;	
++
++	rmesa->radeon.swtcl.verts = (GLubyte *)tnl->clipspace.vertex_buf;
++	rmesa->radeon.swtcl.RenderIndex = ~0;
++	rmesa->radeon.swtcl.render_primitive = GL_TRIANGLES;
++	rmesa->radeon.swtcl.hw_primitive = 0;
+ 
+ 	_tnl_invalidate_vertex_state( ctx, ~0 );
+ 	_tnl_invalidate_vertices( ctx, ~0 );
+@@ -655,9 +582,9 @@ void r300InitSwtcl(GLcontext *ctx)
+ 	_tnl_need_projected_coords( ctx, GL_FALSE );
+ 	r300ChooseRenderState(ctx);
+ 
+-	_mesa_validate_all_lighting_tables( ctx ); 
++	_mesa_validate_all_lighting_tables( ctx );
+ 
+-	tnl->Driver.NotifyMaterialChange = 
++	tnl->Driver.NotifyMaterialChange =
+ 	  _mesa_validate_all_lighting_tables;
+ }
+ 
+@@ -665,33 +592,53 @@ void r300DestroySwtcl(GLcontext *ctx)
+ {
+ }
+ 
+-void r300EmitVertexAOS(r300ContextPtr rmesa, GLuint vertex_size, GLuint offset)
++void r300EmitVertexAOS(r300ContextPtr rmesa, GLuint vertex_size, struct radeon_bo *bo, GLuint offset)
+ {
+-	int cmd_reserved = 0;
+-	int cmd_written = 0;
++	BATCH_LOCALS(&rmesa->radeon);
+ 
+-	drm_radeon_cmd_header_t *cmd = NULL;
+ 	if (RADEON_DEBUG & DEBUG_VERTS)
+-	  fprintf(stderr, "%s:  vertex_size %d, offset 0x%x \n",
+-		  __FUNCTION__, vertex_size, offset);
+-
+-	start_packet3(CP_PACKET3(R300_PACKET3_3D_LOAD_VBPNTR, 2), 2);
+-	e32(1);
+-	e32(vertex_size | (vertex_size << 8));
+-	e32(offset);
++		fprintf(stderr, "%s:  vertex_size %d, offset 0x%x \n",
++			__FUNCTION__, vertex_size, offset);
++
++	BEGIN_BATCH(7);
++	OUT_BATCH_PACKET3(R300_PACKET3_3D_LOAD_VBPNTR, 2);
++	OUT_BATCH(1);
++	OUT_BATCH(vertex_size | (vertex_size << 8));
++	OUT_BATCH_RELOC(offset, bo, offset, RADEON_GEM_DOMAIN_GTT, 0, 0);
++	END_BATCH();
+ }
+ 
+ void r300EmitVbufPrim(r300ContextPtr rmesa, GLuint primitive, GLuint vertex_nr)
+ {
+-
+-	int cmd_reserved = 0;
+-	int cmd_written = 0;
++	BATCH_LOCALS(&rmesa->radeon);
+ 	int type, num_verts;
+-	drm_radeon_cmd_header_t *cmd = NULL;
+ 
+ 	type = r300PrimitiveType(rmesa, primitive);
+ 	num_verts = r300NumVerts(rmesa, vertex_nr, primitive);
+-	
+-	start_packet3(CP_PACKET3(R300_PACKET3_3D_DRAW_VBUF_2, 0), 0);
+-	e32(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST | (num_verts << 16) | type);
++
++	BEGIN_BATCH(3);
++	OUT_BATCH_PACKET3(R300_PACKET3_3D_DRAW_VBUF_2, 0);
++	OUT_BATCH(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST | (num_verts << 16) | type);
++	END_BATCH();
++}
++
++void r300_swtcl_flush(GLcontext *ctx, uint32_t current_offset)
++{
++  r300ContextPtr rmesa = R300_CONTEXT(ctx);
++
++  rcommonEnsureCmdBufSpace(&rmesa->radeon,
++			   rmesa->radeon.hw.max_state_size + (12*sizeof(int)),
++			   __FUNCTION__);
++  radeonEmitState(&rmesa->radeon);
++  r300EmitVertexAOS(rmesa,
++		    rmesa->radeon.swtcl.vertex_size,
++		    rmesa->radeon.dma.current,
++		    current_offset);
++  
++  r300EmitVbufPrim(rmesa,
++		   rmesa->radeon.swtcl.hw_primitive,
++		   rmesa->radeon.swtcl.numverts);
++  r300EmitCacheFlush(rmesa);
++  COMMIT_BATCH();
++
+ }
+diff --git a/src/mesa/drivers/dri/r300/r300_swtcl.h b/src/mesa/drivers/dri/r300/r300_swtcl.h
+index 55df53c..23b4ce3 100644
+--- a/src/mesa/drivers/dri/r300/r300_swtcl.h
++++ b/src/mesa/drivers/dri/r300/r300_swtcl.h
+@@ -42,4 +42,5 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ extern void r300InitSwtcl( GLcontext *ctx );
+ extern void r300DestroySwtcl( GLcontext *ctx );
+ 
++extern void r300_swtcl_flush(GLcontext *ctx, uint32_t current_offset);
+ #endif
+diff --git a/src/mesa/drivers/dri/r300/r300_tex.c b/src/mesa/drivers/dri/r300/r300_tex.c
+index 8ab382c..0f5afbf 100644
+--- a/src/mesa/drivers/dri/r300/r300_tex.c
++++ b/src/mesa/drivers/dri/r300/r300_tex.c
+@@ -38,6 +38,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "main/context.h"
+ #include "main/enums.h"
+ #include "main/image.h"
++#include "main/mipmap.h"
+ #include "main/simple_list.h"
+ #include "main/texformat.h"
+ #include "main/texstore.h"
+@@ -49,6 +50,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "r300_context.h"
+ #include "r300_state.h"
+ #include "r300_ioctl.h"
++#include "radeon_mipmap_tree.h"
+ #include "r300_tex.h"
+ 
+ #include "xmlpool.h"
+@@ -77,20 +79,20 @@ static unsigned int translate_wrap_mode(GLenum wrapmode)
+  *
+  * \param t Texture object whose wrap modes are to be set
+  */
+-static void r300UpdateTexWrap(r300TexObjPtr t)
++static void r300UpdateTexWrap(radeonTexObjPtr t)
+ {
+-	struct gl_texture_object *tObj = t->base.tObj;
++	struct gl_texture_object *tObj = &t->base;
+ 
+-	t->filter &=
++	t->pp_txfilter &=
+ 	    ~(R300_TX_WRAP_S_MASK | R300_TX_WRAP_T_MASK | R300_TX_WRAP_R_MASK);
+ 
+-	t->filter |= translate_wrap_mode(tObj->WrapS) << R300_TX_WRAP_S_SHIFT;
++	t->pp_txfilter |= translate_wrap_mode(tObj->WrapS) << R300_TX_WRAP_S_SHIFT;
+ 
+ 	if (tObj->Target != GL_TEXTURE_1D) {
+-		t->filter |= translate_wrap_mode(tObj->WrapT) << R300_TX_WRAP_T_SHIFT;
++		t->pp_txfilter |= translate_wrap_mode(tObj->WrapT) << R300_TX_WRAP_T_SHIFT;
+ 
+ 		if (tObj->Target == GL_TEXTURE_3D)
+-			t->filter |= translate_wrap_mode(tObj->WrapR) << R300_TX_WRAP_R_SHIFT;
++			t->pp_txfilter |= translate_wrap_mode(tObj->WrapR) << R300_TX_WRAP_R_SHIFT;
+ 	}
+ }
+ 
+@@ -117,10 +119,13 @@ static GLuint aniso_filter(GLfloat anisotropy)
+  * \param magf Texture magnification mode
+  * \param anisotropy Maximum anisotropy level
+  */
+-static void r300SetTexFilter(r300TexObjPtr t, GLenum minf, GLenum magf, GLfloat anisotropy)
++static void r300SetTexFilter(radeonTexObjPtr t, GLenum minf, GLenum magf, GLfloat anisotropy)
+ {
+-	t->filter &= ~(R300_TX_MIN_FILTER_MASK | R300_TX_MIN_FILTER_MIP_MASK | R300_TX_MAG_FILTER_MASK | R300_TX_MAX_ANISO_MASK);
+-	t->filter_1 &= ~R300_EDGE_ANISO_EDGE_ONLY;
++	/* Force revalidation to account for switches from/to mipmapping. */
++	t->validated = GL_FALSE;
++
++	t->pp_txfilter &= ~(R300_TX_MIN_FILTER_MASK | R300_TX_MIN_FILTER_MIP_MASK | R300_TX_MAG_FILTER_MASK | R300_TX_MAX_ANISO_MASK);
++	t->pp_txfilter_1 &= ~R300_EDGE_ANISO_EDGE_ONLY;
+ 
+ 	/* Note that EXT_texture_filter_anisotropic is extremely vague about
+ 	 * how anisotropic filtering interacts with the "normal" filter modes.
+@@ -128,7 +133,7 @@ static void r300SetTexFilter(r300TexObjPtr t, GLenum minf, GLenum magf, GLfloat
+ 	 * filter settings completely. This includes driconf's settings.
+ 	 */
+ 	if (anisotropy >= 2.0 && (minf != GL_NEAREST) && (magf != GL_NEAREST)) {
+-		t->filter |= R300_TX_MAG_FILTER_ANISO
++		t->pp_txfilter |= R300_TX_MAG_FILTER_ANISO
+ 			| R300_TX_MIN_FILTER_ANISO
+ 			| R300_TX_MIN_FILTER_MIP_LINEAR
+ 			| aniso_filter(anisotropy);
+@@ -139,22 +144,22 @@ static void r300SetTexFilter(r300TexObjPtr t, GLenum minf, GLenum magf, GLfloat
+ 
+ 	switch (minf) {
+ 	case GL_NEAREST:
+-		t->filter |= R300_TX_MIN_FILTER_NEAREST;
++		t->pp_txfilter |= R300_TX_MIN_FILTER_NEAREST;
+ 		break;
+ 	case GL_LINEAR:
+-		t->filter |= R300_TX_MIN_FILTER_LINEAR;
++		t->pp_txfilter |= R300_TX_MIN_FILTER_LINEAR;
+ 		break;
+ 	case GL_NEAREST_MIPMAP_NEAREST:
+-		t->filter |= R300_TX_MIN_FILTER_NEAREST|R300_TX_MIN_FILTER_MIP_NEAREST;
++		t->pp_txfilter |= R300_TX_MIN_FILTER_NEAREST|R300_TX_MIN_FILTER_MIP_NEAREST;
+ 		break;
+ 	case GL_NEAREST_MIPMAP_LINEAR:
+-		t->filter |= R300_TX_MIN_FILTER_NEAREST|R300_TX_MIN_FILTER_MIP_LINEAR;
++		t->pp_txfilter |= R300_TX_MIN_FILTER_NEAREST|R300_TX_MIN_FILTER_MIP_LINEAR;
+ 		break;
+ 	case GL_LINEAR_MIPMAP_NEAREST:
+-		t->filter |= R300_TX_MIN_FILTER_LINEAR|R300_TX_MIN_FILTER_MIP_NEAREST;
++		t->pp_txfilter |= R300_TX_MIN_FILTER_LINEAR|R300_TX_MIN_FILTER_MIP_NEAREST;
+ 		break;
+ 	case GL_LINEAR_MIPMAP_LINEAR:
+-		t->filter |= R300_TX_MIN_FILTER_LINEAR|R300_TX_MIN_FILTER_MIP_LINEAR;
++		t->pp_txfilter |= R300_TX_MIN_FILTER_LINEAR|R300_TX_MIN_FILTER_MIP_LINEAR;
+ 		break;
+ 	}
+ 
+@@ -163,743 +168,20 @@ static void r300SetTexFilter(r300TexObjPtr t, GLenum minf, GLenum magf, GLfloat
+ 	 */
+ 	switch (magf) {
+ 	case GL_NEAREST:
+-		t->filter |= R300_TX_MAG_FILTER_NEAREST;
++		t->pp_txfilter |= R300_TX_MAG_FILTER_NEAREST;
+ 		break;
+ 	case GL_LINEAR:
+-		t->filter |= R300_TX_MAG_FILTER_LINEAR;
++		t->pp_txfilter |= R300_TX_MAG_FILTER_LINEAR;
+ 		break;
+ 	}
+ }
+ 
+-static void r300SetTexBorderColor(r300TexObjPtr t, GLubyte c[4])
++static void r300SetTexBorderColor(radeonTexObjPtr t, GLubyte c[4])
+ {
+ 	t->pp_border_color = PACK_COLOR_8888(c[3], c[0], c[1], c[2]);
+ }
+ 
+ /**
+- * Allocate space for and load the mesa images into the texture memory block.
+- * This will happen before drawing with a new texture, or drawing with a
+- * texture after it was swapped out or teximaged again.
+- */
+-
+-static r300TexObjPtr r300AllocTexObj(struct gl_texture_object *texObj)
+-{
+-	r300TexObjPtr t;
+-
+-	t = CALLOC_STRUCT(r300_tex_obj);
+-	texObj->DriverData = t;
+-	if (t != NULL) {
+-		if (RADEON_DEBUG & DEBUG_TEXTURE) {
+-			fprintf(stderr, "%s( %p, %p )\n", __FUNCTION__,
+-				(void *)texObj, (void *)t);
+-		}
+-
+-		/* Initialize non-image-dependent parts of the state:
+-		 */
+-		t->base.tObj = texObj;
+-		t->border_fallback = GL_FALSE;
+-
+-		make_empty_list(&t->base);
+-
+-		r300UpdateTexWrap(t);
+-		r300SetTexFilter(t, texObj->MinFilter, texObj->MagFilter, texObj->MaxAnisotropy);
+-		r300SetTexBorderColor(t, texObj->_BorderChan);
+-	}
+-
+-	return t;
+-}
+-
+-/* try to find a format which will only need a memcopy */
+-static const struct gl_texture_format *r300Choose8888TexFormat(GLenum srcFormat,
+-							       GLenum srcType)
+-{
+-	const GLuint ui = 1;
+-	const GLubyte littleEndian = *((const GLubyte *)&ui);
+-
+-	if ((srcFormat == GL_RGBA && srcType == GL_UNSIGNED_INT_8_8_8_8) ||
+-	    (srcFormat == GL_RGBA && srcType == GL_UNSIGNED_BYTE && !littleEndian) ||
+-	    (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_INT_8_8_8_8_REV) ||
+-	    (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_BYTE && littleEndian)) {
+-		return &_mesa_texformat_rgba8888;
+-	} else if ((srcFormat == GL_RGBA && srcType == GL_UNSIGNED_INT_8_8_8_8_REV) ||
+-		   (srcFormat == GL_RGBA && srcType == GL_UNSIGNED_BYTE && littleEndian) ||
+-		   (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_INT_8_8_8_8) ||
+-		   (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_BYTE && !littleEndian)) {
+-		return &_mesa_texformat_rgba8888_rev;
+-	} else if (srcFormat == GL_BGRA && ((srcType == GL_UNSIGNED_BYTE && !littleEndian) ||
+-					    srcType == GL_UNSIGNED_INT_8_8_8_8)) {
+-		return &_mesa_texformat_argb8888_rev;
+-	} else if (srcFormat == GL_BGRA && ((srcType == GL_UNSIGNED_BYTE && littleEndian) ||
+-					    srcType == GL_UNSIGNED_INT_8_8_8_8_REV)) {
+-		return &_mesa_texformat_argb8888;
+-	} else
+-		return _dri_texformat_argb8888;
+-}
+-
+-static const struct gl_texture_format *r300ChooseTextureFormat(GLcontext * ctx,
+-							       GLint
+-							       internalFormat,
+-							       GLenum format,
+-							       GLenum type)
+-{
+-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+-	const GLboolean do32bpt =
+-	    (rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_32);
+-	const GLboolean force16bpt =
+-	    (rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_FORCE_16);
+-	(void)format;
+-
+-#if 0
+-	fprintf(stderr, "InternalFormat=%s(%d) type=%s format=%s\n",
+-		_mesa_lookup_enum_by_nr(internalFormat), internalFormat,
+-		_mesa_lookup_enum_by_nr(type), _mesa_lookup_enum_by_nr(format));
+-	fprintf(stderr, "do32bpt=%d force16bpt=%d\n", do32bpt, force16bpt);
+-#endif
+-
+-	switch (internalFormat) {
+-	case 4:
+-	case GL_RGBA:
+-	case GL_COMPRESSED_RGBA:
+-		switch (type) {
+-		case GL_UNSIGNED_INT_10_10_10_2:
+-		case GL_UNSIGNED_INT_2_10_10_10_REV:
+-			return do32bpt ? _dri_texformat_argb8888 :
+-			    _dri_texformat_argb1555;
+-		case GL_UNSIGNED_SHORT_4_4_4_4:
+-		case GL_UNSIGNED_SHORT_4_4_4_4_REV:
+-			return _dri_texformat_argb4444;
+-		case GL_UNSIGNED_SHORT_5_5_5_1:
+-		case GL_UNSIGNED_SHORT_1_5_5_5_REV:
+-			return _dri_texformat_argb1555;
+-		default:
+-			return do32bpt ? r300Choose8888TexFormat(format, type) :
+-			    _dri_texformat_argb4444;
+-		}
+-
+-	case 3:
+-	case GL_RGB:
+-	case GL_COMPRESSED_RGB:
+-		switch (type) {
+-		case GL_UNSIGNED_SHORT_4_4_4_4:
+-		case GL_UNSIGNED_SHORT_4_4_4_4_REV:
+-			return _dri_texformat_argb4444;
+-		case GL_UNSIGNED_SHORT_5_5_5_1:
+-		case GL_UNSIGNED_SHORT_1_5_5_5_REV:
+-			return _dri_texformat_argb1555;
+-		case GL_UNSIGNED_SHORT_5_6_5:
+-		case GL_UNSIGNED_SHORT_5_6_5_REV:
+-			return _dri_texformat_rgb565;
+-		default:
+-			return do32bpt ? _dri_texformat_argb8888 :
+-			    _dri_texformat_rgb565;
+-		}
+-
+-	case GL_RGBA8:
+-	case GL_RGB10_A2:
+-	case GL_RGBA12:
+-	case GL_RGBA16:
+-		return !force16bpt ?
+-		    r300Choose8888TexFormat(format,
+-					    type) : _dri_texformat_argb4444;
+-
+-	case GL_RGBA4:
+-	case GL_RGBA2:
+-		return _dri_texformat_argb4444;
+-
+-	case GL_RGB5_A1:
+-		return _dri_texformat_argb1555;
+-
+-	case GL_RGB8:
+-	case GL_RGB10:
+-	case GL_RGB12:
+-	case GL_RGB16:
+-		return !force16bpt ? _dri_texformat_argb8888 :
+-		    _dri_texformat_rgb565;
+-
+-	case GL_RGB5:
+-	case GL_RGB4:
+-	case GL_R3_G3_B2:
+-		return _dri_texformat_rgb565;
+-
+-	case GL_ALPHA:
+-	case GL_ALPHA4:
+-	case GL_ALPHA8:
+-	case GL_ALPHA12:
+-	case GL_ALPHA16:
+-	case GL_COMPRESSED_ALPHA:
+-		return _dri_texformat_a8;
+-
+-	case 1:
+-	case GL_LUMINANCE:
+-	case GL_LUMINANCE4:
+-	case GL_LUMINANCE8:
+-	case GL_LUMINANCE12:
+-	case GL_LUMINANCE16:
+-	case GL_COMPRESSED_LUMINANCE:
+-		return _dri_texformat_l8;
+-
+-	case 2:
+-	case GL_LUMINANCE_ALPHA:
+-	case GL_LUMINANCE4_ALPHA4:
+-	case GL_LUMINANCE6_ALPHA2:
+-	case GL_LUMINANCE8_ALPHA8:
+-	case GL_LUMINANCE12_ALPHA4:
+-	case GL_LUMINANCE12_ALPHA12:
+-	case GL_LUMINANCE16_ALPHA16:
+-	case GL_COMPRESSED_LUMINANCE_ALPHA:
+-		return _dri_texformat_al88;
+-
+-	case GL_INTENSITY:
+-	case GL_INTENSITY4:
+-	case GL_INTENSITY8:
+-	case GL_INTENSITY12:
+-	case GL_INTENSITY16:
+-	case GL_COMPRESSED_INTENSITY:
+-		return _dri_texformat_i8;
+-
+-	case GL_YCBCR_MESA:
+-		if (type == GL_UNSIGNED_SHORT_8_8_APPLE ||
+-		    type == GL_UNSIGNED_BYTE)
+-			return &_mesa_texformat_ycbcr;
+-		else
+-			return &_mesa_texformat_ycbcr_rev;
+-
+-	case GL_RGB_S3TC:
+-	case GL_RGB4_S3TC:
+-	case GL_COMPRESSED_RGB_S3TC_DXT1_EXT:
+-		return &_mesa_texformat_rgb_dxt1;
+-
+-	case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT:
+-		return &_mesa_texformat_rgba_dxt1;
+-
+-	case GL_RGBA_S3TC:
+-	case GL_RGBA4_S3TC:
+-	case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT:
+-		return &_mesa_texformat_rgba_dxt3;
+-
+-	case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT:
+-		return &_mesa_texformat_rgba_dxt5;
+-
+-	case GL_ALPHA16F_ARB:
+-		return &_mesa_texformat_alpha_float16;
+-	case GL_ALPHA32F_ARB:
+-		return &_mesa_texformat_alpha_float32;
+-	case GL_LUMINANCE16F_ARB:
+-		return &_mesa_texformat_luminance_float16;
+-	case GL_LUMINANCE32F_ARB:
+-		return &_mesa_texformat_luminance_float32;
+-	case GL_LUMINANCE_ALPHA16F_ARB:
+-		return &_mesa_texformat_luminance_alpha_float16;
+-	case GL_LUMINANCE_ALPHA32F_ARB:
+-		return &_mesa_texformat_luminance_alpha_float32;
+-	case GL_INTENSITY16F_ARB:
+-		return &_mesa_texformat_intensity_float16;
+-	case GL_INTENSITY32F_ARB:
+-		return &_mesa_texformat_intensity_float32;
+-	case GL_RGB16F_ARB:
+-		return &_mesa_texformat_rgba_float16;
+-	case GL_RGB32F_ARB:
+-		return &_mesa_texformat_rgba_float32;
+-	case GL_RGBA16F_ARB:
+-		return &_mesa_texformat_rgba_float16;
+-	case GL_RGBA32F_ARB:
+-		return &_mesa_texformat_rgba_float32;
+-
+-	case GL_DEPTH_COMPONENT:
+-	case GL_DEPTH_COMPONENT16:
+-	case GL_DEPTH_COMPONENT24:
+-	case GL_DEPTH_COMPONENT32:
+-#if 0
+-		switch (type) {
+-		case GL_UNSIGNED_BYTE:
+-		case GL_UNSIGNED_SHORT:
+-			return &_mesa_texformat_z16;
+-		case GL_UNSIGNED_INT:
+-			return &_mesa_texformat_z32;
+-		case GL_UNSIGNED_INT_24_8_EXT:
+-		default:
+-			return &_mesa_texformat_z24_s8;
+-		}
+-#else
+-		return &_mesa_texformat_z16;
+-#endif
+-
+-	default:
+-		_mesa_problem(ctx,
+-			      "unexpected internalFormat 0x%x in r300ChooseTextureFormat",
+-			      (int)internalFormat);
+-		return NULL;
+-	}
+-
+-	return NULL;		/* never get here */
+-}
+-
+-static GLboolean
+-r300ValidateClientStorage(GLcontext * ctx, GLenum target,
+-			  GLint internalFormat,
+-			  GLint srcWidth, GLint srcHeight,
+-			  GLenum format, GLenum type, const void *pixels,
+-			  const struct gl_pixelstore_attrib *packing,
+-			  struct gl_texture_object *texObj,
+-			  struct gl_texture_image *texImage)
+-{
+-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+-
+-	if (RADEON_DEBUG & DEBUG_TEXTURE)
+-		fprintf(stderr, "intformat %s format %s type %s\n",
+-			_mesa_lookup_enum_by_nr(internalFormat),
+-			_mesa_lookup_enum_by_nr(format),
+-			_mesa_lookup_enum_by_nr(type));
+-
+-	if (!ctx->Unpack.ClientStorage)
+-		return 0;
+-
+-	if (ctx->_ImageTransferState ||
+-	    texImage->IsCompressed || texObj->GenerateMipmap)
+-		return 0;
+-
+-	/* This list is incomplete, may be different on ppc???
+-	 */
+-	switch (internalFormat) {
+-	case GL_RGBA:
+-		if (format == GL_BGRA && type == GL_UNSIGNED_INT_8_8_8_8_REV) {
+-			texImage->TexFormat = _dri_texformat_argb8888;
+-		} else
+-			return 0;
+-		break;
+-
+-	case GL_RGB:
+-		if (format == GL_RGB && type == GL_UNSIGNED_SHORT_5_6_5) {
+-			texImage->TexFormat = _dri_texformat_rgb565;
+-		} else
+-			return 0;
+-		break;
+-
+-	case GL_YCBCR_MESA:
+-		if (format == GL_YCBCR_MESA &&
+-		    type == GL_UNSIGNED_SHORT_8_8_REV_APPLE) {
+-			texImage->TexFormat = &_mesa_texformat_ycbcr_rev;
+-		} else if (format == GL_YCBCR_MESA &&
+-			   (type == GL_UNSIGNED_SHORT_8_8_APPLE ||
+-			    type == GL_UNSIGNED_BYTE)) {
+-			texImage->TexFormat = &_mesa_texformat_ycbcr;
+-		} else
+-			return 0;
+-		break;
+-
+-	default:
+-		return 0;
+-	}
+-
+-	/* Could deal with these packing issues, but currently don't:
+-	 */
+-	if (packing->SkipPixels ||
+-	    packing->SkipRows || packing->SwapBytes || packing->LsbFirst) {
+-		return 0;
+-	}
+-
+-	GLint srcRowStride = _mesa_image_row_stride(packing, srcWidth,
+-						    format, type);
+-
+-	if (RADEON_DEBUG & DEBUG_TEXTURE)
+-		fprintf(stderr, "%s: srcRowStride %d/%x\n",
+-			__FUNCTION__, srcRowStride, srcRowStride);
+-
+-	/* Could check this later in upload, pitch restrictions could be
+-	 * relaxed, but would need to store the image pitch somewhere,
+-	 * as packing details might change before image is uploaded:
+-	 */
+-	if (!r300IsGartMemory(rmesa, pixels, srcHeight * srcRowStride)
+-	    || (srcRowStride & 63))
+-		return 0;
+-
+-	/* Have validated that _mesa_transfer_teximage would be a straight
+-	 * memcpy at this point.  NOTE: future calls to TexSubImage will
+-	 * overwrite the client data.  This is explicitly mentioned in the
+-	 * extension spec.
+-	 */
+-	texImage->Data = (void *)pixels;
+-	texImage->IsClientData = GL_TRUE;
+-	texImage->RowStride = srcRowStride / texImage->TexFormat->TexelBytes;
+-
+-	return 1;
+-}
+-
+-static void r300TexImage1D(GLcontext * ctx, GLenum target, GLint level,
+-			   GLint internalFormat,
+-			   GLint width, GLint border,
+-			   GLenum format, GLenum type, const GLvoid * pixels,
+-			   const struct gl_pixelstore_attrib *packing,
+-			   struct gl_texture_object *texObj,
+-			   struct gl_texture_image *texImage)
+-{
+-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
+-
+-	if (t) {
+-		driSwapOutTextureObject(t);
+-	} else {
+-		t = (driTextureObject *) r300AllocTexObj(texObj);
+-		if (!t) {
+-			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage1D");
+-			return;
+-		}
+-	}
+-
+-	/* Note, this will call ChooseTextureFormat */
+-	_mesa_store_teximage1d(ctx, target, level, internalFormat,
+-			       width, border, format, type, pixels,
+-			       &ctx->Unpack, texObj, texImage);
+-
+-	t->dirty_images[0] |= (1 << level);
+-}
+-
+-static void r300TexSubImage1D(GLcontext * ctx, GLenum target, GLint level,
+-			      GLint xoffset,
+-			      GLsizei width,
+-			      GLenum format, GLenum type,
+-			      const GLvoid * pixels,
+-			      const struct gl_pixelstore_attrib *packing,
+-			      struct gl_texture_object *texObj,
+-			      struct gl_texture_image *texImage)
+-{
+-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
+-
+-	assert(t);		/* this _should_ be true */
+-	if (t) {
+-		driSwapOutTextureObject(t);
+-	} else {
+-		t = (driTextureObject *) r300AllocTexObj(texObj);
+-		if (!t) {
+-			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage1D");
+-			return;
+-		}
+-	}
+-
+-	_mesa_store_texsubimage1d(ctx, target, level, xoffset, width,
+-				  format, type, pixels, packing, texObj,
+-				  texImage);
+-
+-	t->dirty_images[0] |= (1 << level);
+-}
+-
+-static void r300TexImage2D(GLcontext * ctx, GLenum target, GLint level,
+-			   GLint internalFormat,
+-			   GLint width, GLint height, GLint border,
+-			   GLenum format, GLenum type, const GLvoid * pixels,
+-			   const struct gl_pixelstore_attrib *packing,
+-			   struct gl_texture_object *texObj,
+-			   struct gl_texture_image *texImage)
+-{
+-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
+-	GLuint face;
+-
+-	/* which cube face or ordinary 2D image */
+-	switch (target) {
+-	case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
+-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
+-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
+-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
+-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
+-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
+-		face =
+-		    (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
+-		ASSERT(face < 6);
+-		break;
+-	default:
+-		face = 0;
+-	}
+-
+-	if (t != NULL) {
+-		driSwapOutTextureObject(t);
+-	} else {
+-		t = (driTextureObject *) r300AllocTexObj(texObj);
+-		if (!t) {
+-			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage2D");
+-			return;
+-		}
+-	}
+-
+-	texImage->IsClientData = GL_FALSE;
+-
+-	if (r300ValidateClientStorage(ctx, target,
+-				      internalFormat,
+-				      width, height,
+-				      format, type, pixels,
+-				      packing, texObj, texImage)) {
+-		if (RADEON_DEBUG & DEBUG_TEXTURE)
+-			fprintf(stderr, "%s: Using client storage\n",
+-				__FUNCTION__);
+-	} else {
+-		if (RADEON_DEBUG & DEBUG_TEXTURE)
+-			fprintf(stderr, "%s: Using normal storage\n",
+-				__FUNCTION__);
+-
+-		/* Normal path: copy (to cached memory) and eventually upload
+-		 * via another copy to GART memory and then a blit...  Could
+-		 * eliminate one copy by going straight to (permanent) GART.
+-		 *
+-		 * Note, this will call r300ChooseTextureFormat.
+-		 */
+-		_mesa_store_teximage2d(ctx, target, level, internalFormat,
+-				       width, height, border, format, type,
+-				       pixels, &ctx->Unpack, texObj, texImage);
+-
+-		t->dirty_images[face] |= (1 << level);
+-	}
+-}
+-
+-static void r300TexSubImage2D(GLcontext * ctx, GLenum target, GLint level,
+-			      GLint xoffset, GLint yoffset,
+-			      GLsizei width, GLsizei height,
+-			      GLenum format, GLenum type,
+-			      const GLvoid * pixels,
+-			      const struct gl_pixelstore_attrib *packing,
+-			      struct gl_texture_object *texObj,
+-			      struct gl_texture_image *texImage)
+-{
+-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
+-	GLuint face;
+-
+-	/* which cube face or ordinary 2D image */
+-	switch (target) {
+-	case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
+-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
+-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
+-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
+-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
+-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
+-		face =
+-		    (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
+-		ASSERT(face < 6);
+-		break;
+-	default:
+-		face = 0;
+-	}
+-
+-	assert(t);		/* this _should_ be true */
+-	if (t) {
+-		driSwapOutTextureObject(t);
+-	} else {
+-		t = (driTextureObject *) r300AllocTexObj(texObj);
+-		if (!t) {
+-			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage2D");
+-			return;
+-		}
+-	}
+-
+-	_mesa_store_texsubimage2d(ctx, target, level, xoffset, yoffset, width,
+-				  height, format, type, pixels, packing, texObj,
+-				  texImage);
+-
+-	t->dirty_images[face] |= (1 << level);
+-}
+-
+-static void r300CompressedTexImage2D(GLcontext * ctx, GLenum target,
+-				     GLint level, GLint internalFormat,
+-				     GLint width, GLint height, GLint border,
+-				     GLsizei imageSize, const GLvoid * data,
+-				     struct gl_texture_object *texObj,
+-				     struct gl_texture_image *texImage)
+-{
+-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
+-	GLuint face;
+-
+-	/* which cube face or ordinary 2D image */
+-	switch (target) {
+-	case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
+-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
+-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
+-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
+-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
+-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
+-		face =
+-		    (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
+-		ASSERT(face < 6);
+-		break;
+-	default:
+-		face = 0;
+-	}
+-
+-	if (t != NULL) {
+-		driSwapOutTextureObject(t);
+-	} else {
+-		t = (driTextureObject *) r300AllocTexObj(texObj);
+-		if (!t) {
+-			_mesa_error(ctx, GL_OUT_OF_MEMORY,
+-				    "glCompressedTexImage2D");
+-			return;
+-		}
+-	}
+-
+-	texImage->IsClientData = GL_FALSE;
+-
+-	/* can't call this, different parameters. Would never evaluate to true anyway currently */
+-#if 0
+-	if (r300ValidateClientStorage(ctx, target,
+-				      internalFormat,
+-				      width, height,
+-				      format, type, pixels,
+-				      packing, texObj, texImage)) {
+-		if (RADEON_DEBUG & DEBUG_TEXTURE)
+-			fprintf(stderr, "%s: Using client storage\n",
+-				__FUNCTION__);
+-	} else
+-#endif
+-	{
+-		if (RADEON_DEBUG & DEBUG_TEXTURE)
+-			fprintf(stderr, "%s: Using normal storage\n",
+-				__FUNCTION__);
+-
+-		/* Normal path: copy (to cached memory) and eventually upload
+-		 * via another copy to GART memory and then a blit...  Could
+-		 * eliminate one copy by going straight to (permanent) GART.
+-		 *
+-		 * Note, this will call r300ChooseTextureFormat.
+-		 */
+-		_mesa_store_compressed_teximage2d(ctx, target, level,
+-						  internalFormat, width, height,
+-						  border, imageSize, data,
+-						  texObj, texImage);
+-
+-		t->dirty_images[face] |= (1 << level);
+-	}
+-}
+-
+-static void r300CompressedTexSubImage2D(GLcontext * ctx, GLenum target,
+-					GLint level, GLint xoffset,
+-					GLint yoffset, GLsizei width,
+-					GLsizei height, GLenum format,
+-					GLsizei imageSize, const GLvoid * data,
+-					struct gl_texture_object *texObj,
+-					struct gl_texture_image *texImage)
+-{
+-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
+-	GLuint face;
+-
+-	/* which cube face or ordinary 2D image */
+-	switch (target) {
+-	case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
+-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
+-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
+-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
+-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
+-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
+-		face =
+-		    (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
+-		ASSERT(face < 6);
+-		break;
+-	default:
+-		face = 0;
+-	}
+-
+-	assert(t);		/* this _should_ be true */
+-	if (t) {
+-		driSwapOutTextureObject(t);
+-	} else {
+-		t = (driTextureObject *) r300AllocTexObj(texObj);
+-		if (!t) {
+-			_mesa_error(ctx, GL_OUT_OF_MEMORY,
+-				    "glCompressedTexSubImage3D");
+-			return;
+-		}
+-	}
+-
+-	_mesa_store_compressed_texsubimage2d(ctx, target, level, xoffset,
+-					     yoffset, width, height, format,
+-					     imageSize, data, texObj, texImage);
+-
+-	t->dirty_images[face] |= (1 << level);
+-}
+-
+-static void r300TexImage3D(GLcontext * ctx, GLenum target, GLint level,
+-			   GLint internalFormat,
+-			   GLint width, GLint height, GLint depth,
+-			   GLint border,
+-			   GLenum format, GLenum type, const GLvoid * pixels,
+-			   const struct gl_pixelstore_attrib *packing,
+-			   struct gl_texture_object *texObj,
+-			   struct gl_texture_image *texImage)
+-{
+-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
+-
+-	if (t) {
+-		driSwapOutTextureObject(t);
+-	} else {
+-		t = (driTextureObject *) r300AllocTexObj(texObj);
+-		if (!t) {
+-			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage3D");
+-			return;
+-		}
+-	}
+-
+-	texImage->IsClientData = GL_FALSE;
+-
+-#if 0
+-	if (r300ValidateClientStorage(ctx, target,
+-				      internalFormat,
+-				      width, height,
+-				      format, type, pixels,
+-				      packing, texObj, texImage)) {
+-		if (RADEON_DEBUG & DEBUG_TEXTURE)
+-			fprintf(stderr, "%s: Using client storage\n",
+-				__FUNCTION__);
+-	} else
+-#endif
+-	{
+-		if (RADEON_DEBUG & DEBUG_TEXTURE)
+-			fprintf(stderr, "%s: Using normal storage\n",
+-				__FUNCTION__);
+-
+-		/* Normal path: copy (to cached memory) and eventually upload
+-		 * via another copy to GART memory and then a blit...  Could
+-		 * eliminate one copy by going straight to (permanent) GART.
+-		 *
+-		 * Note, this will call r300ChooseTextureFormat.
+-		 */
+-		_mesa_store_teximage3d(ctx, target, level, internalFormat,
+-				       width, height, depth, border,
+-				       format, type, pixels,
+-				       &ctx->Unpack, texObj, texImage);
+-
+-		t->dirty_images[0] |= (1 << level);
+-	}
+-}
+-
+-static void
+-r300TexSubImage3D(GLcontext * ctx, GLenum target, GLint level,
+-		  GLint xoffset, GLint yoffset, GLint zoffset,
+-		  GLsizei width, GLsizei height, GLsizei depth,
+-		  GLenum format, GLenum type,
+-		  const GLvoid * pixels,
+-		  const struct gl_pixelstore_attrib *packing,
+-		  struct gl_texture_object *texObj,
+-		  struct gl_texture_image *texImage)
+-{
+-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
+-
+-/*     fprintf(stderr, "%s\n", __FUNCTION__); */
+-
+-	assert(t);		/* this _should_ be true */
+-	if (t) {
+-		driSwapOutTextureObject(t);
+-	} else {
+-		t = (driTextureObject *) r300AllocTexObj(texObj);
+-		if (!t) {
+-			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage3D");
+-			return;
+-		}
+-		texObj->DriverData = t;
+-	}
+-
+-	_mesa_store_texsubimage3d(ctx, target, level, xoffset, yoffset, zoffset,
+-				  width, height, depth,
+-				  format, type, pixels, packing, texObj,
+-				  texImage);
+-
+-	t->dirty_images[0] |= (1 << level);
+-}
+-
+-/**
+  * Changes variables and flags for a state update, which will happen at the
+  * next UpdateTextureState
+  */
+@@ -908,7 +190,7 @@ static void r300TexParameter(GLcontext * ctx, GLenum target,
+ 			     struct gl_texture_object *texObj,
+ 			     GLenum pname, const GLfloat * params)
+ {
+-	r300TexObjPtr t = (r300TexObjPtr) texObj->DriverData;
++	radeonTexObj* t = radeon_tex_obj(texObj);
+ 
+ 	if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_TEXTURE)) {
+ 		fprintf(stderr, "%s( %s )\n", __FUNCTION__,
+@@ -941,7 +223,11 @@ static void r300TexParameter(GLcontext * ctx, GLenum target,
+ 		 * we just have to rely on loading the right subset of mipmap levels
+ 		 * to simulate a clamped LOD.
+ 		 */
+-		driSwapOutTextureObject((driTextureObject *) t);
++		if (t->mt) {
++			radeon_miptree_unreference(t->mt);
++			t->mt = 0;
++			t->validated = GL_FALSE;
++		}
+ 		break;
+ 
+ 	case GL_DEPTH_TEXTURE_MODE:
+@@ -964,27 +250,10 @@ static void r300TexParameter(GLcontext * ctx, GLenum target,
+ 	}
+ }
+ 
+-static void r300BindTexture(GLcontext * ctx, GLenum target,
+-			    struct gl_texture_object *texObj)
+-{
+-	if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_TEXTURE)) {
+-		fprintf(stderr, "%s( %p ) unit=%d\n", __FUNCTION__,
+-			(void *)texObj, ctx->Texture.CurrentUnit);
+-	}
+-
+-	if ((target == GL_TEXTURE_1D)
+-	    || (target == GL_TEXTURE_2D)
+-	    || (target == GL_TEXTURE_3D)
+-	    || (target == GL_TEXTURE_CUBE_MAP)
+-	    || (target == GL_TEXTURE_RECTANGLE_NV)) {
+-		assert(texObj->DriverData != NULL);
+-	}
+-}
+-
+ static void r300DeleteTexture(GLcontext * ctx, struct gl_texture_object *texObj)
+ {
+ 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
++	radeonTexObj* t = radeon_tex_obj(texObj);
+ 
+ 	if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_TEXTURE)) {
+ 		fprintf(stderr, "%s( %p (target = %s) )\n", __FUNCTION__,
+@@ -992,14 +261,24 @@ static void r300DeleteTexture(GLcontext * ctx, struct gl_texture_object *texObj)
+ 			_mesa_lookup_enum_by_nr(texObj->Target));
+ 	}
+ 
+-	if (t != NULL) {
+-		if (rmesa) {
+-			R300_FIREVERTICES(rmesa);
+-		}
++	if (rmesa) {
++		int i;
++		radeon_firevertices(&rmesa->radeon);
++
++		for(i = 0; i < R300_MAX_TEXTURE_UNITS; ++i)
++			if (rmesa->hw.textures[i] == t)
++				rmesa->hw.textures[i] = 0;
++	}
+ 
+-		driDestroyTextureObject(t);
++	if (t->bo) {
++		radeon_bo_unref(t->bo);
++		t->bo = NULL;
++	}
++
++	if (t->mt) {
++		radeon_miptree_unreference(t->mt);
++		t->mt = 0;
+ 	}
+-	/* Free mipmap images and the texture object itself */
+ 	_mesa_delete_texture_object(ctx, texObj);
+ }
+ 
+@@ -1008,8 +287,6 @@ static void r300DeleteTexture(GLcontext * ctx, struct gl_texture_object *texObj)
+  * Called via ctx->Driver.NewTextureObject.
+  * Note: this function will be called during context creation to
+  * allocate the default texture objects.
+- * Note: we could use containment here to 'derive' the driver-specific
+- * texture object from the core mesa gl_texture_object.  Not done at this time.
+  * Fixup MaxAnisotropy according to user preference.
+  */
+ static struct gl_texture_object *r300NewTextureObject(GLcontext * ctx,
+@@ -1017,14 +294,23 @@ static struct gl_texture_object *r300NewTextureObject(GLcontext * ctx,
+ 						      GLenum target)
+ {
+ 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+-	struct gl_texture_object *obj;
+-	obj = _mesa_new_texture_object(ctx, name, target);
+-	if (!obj)
+-		return NULL;
+-	obj->MaxAnisotropy = rmesa->initialMaxAnisotropy;
++	radeonTexObj* t = CALLOC_STRUCT(radeon_tex_obj);
++	if (!t) return NULL; /* OOM: keep the NULL-on-failure contract the removed code had */
+ 
+-	r300AllocTexObj(obj);
+-	return obj;
++	if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_TEXTURE)) {
++		fprintf(stderr, "%s( %p (target = %s) )\n", __FUNCTION__,
++			t, _mesa_lookup_enum_by_nr(target));
++	}
++
++	_mesa_initialize_texture_object(&t->base, name, target);
++	t->base.MaxAnisotropy = rmesa->radeon.initialMaxAnisotropy;
++
++	/* Initialize hardware state */
++	r300UpdateTexWrap(t);
++	r300SetTexFilter(t, t->base.MinFilter, t->base.MagFilter, t->base.MaxAnisotropy);
++	r300SetTexBorderColor(t, t->base._BorderChan);
++
++	return &t->base;
+ }
+ 
+ void r300InitTextureFuncs(struct dd_function_table *functions)
+@@ -1032,22 +318,30 @@ void r300InitTextureFuncs(struct dd_function_table *functions)
+ 	/* Note: we only plug in the functions we implement in the driver
+ 	 * since _mesa_init_driver_functions() was already called.
+ 	 */
+-	functions->ChooseTextureFormat = r300ChooseTextureFormat;
+-	functions->TexImage1D = r300TexImage1D;
+-	functions->TexImage2D = r300TexImage2D;
+-	functions->TexImage3D = r300TexImage3D;
+-	functions->TexSubImage1D = r300TexSubImage1D;
+-	functions->TexSubImage2D = r300TexSubImage2D;
+-	functions->TexSubImage3D = r300TexSubImage3D;
++	functions->NewTextureImage = radeonNewTextureImage;
++	functions->FreeTexImageData = radeonFreeTexImageData;
++	functions->MapTexture = radeonMapTexture;
++	functions->UnmapTexture = radeonUnmapTexture;
++
++	functions->ChooseTextureFormat = radeonChooseTextureFormat;
++	functions->TexImage1D = radeonTexImage1D;
++	functions->TexImage2D = radeonTexImage2D;
++	functions->TexImage3D = radeonTexImage3D;
++	functions->TexSubImage1D = radeonTexSubImage1D;
++	functions->TexSubImage2D = radeonTexSubImage2D;
++	functions->TexSubImage3D = radeonTexSubImage3D;
++	functions->GetTexImage = radeonGetTexImage;
++	functions->GetCompressedTexImage = radeonGetCompressedTexImage;
+ 	functions->NewTextureObject = r300NewTextureObject;
+-	functions->BindTexture = r300BindTexture;
+ 	functions->DeleteTexture = r300DeleteTexture;
+ 	functions->IsTextureResident = driIsTextureResident;
+ 
+ 	functions->TexParameter = r300TexParameter;
+ 
+-	functions->CompressedTexImage2D = r300CompressedTexImage2D;
+-	functions->CompressedTexSubImage2D = r300CompressedTexSubImage2D;
++	functions->CompressedTexImage2D = radeonCompressedTexImage2D;
++	functions->CompressedTexSubImage2D = radeonCompressedTexSubImage2D;
++
++	functions->GenerateMipmap = radeonGenerateMipmap;
+ 
+ 	driInitTextureFormats();
+ }
+diff --git a/src/mesa/drivers/dri/r300/r300_tex.h b/src/mesa/drivers/dri/r300/r300_tex.h
+index b86d45b..baad3fe 100644
+--- a/src/mesa/drivers/dri/r300/r300_tex.h
++++ b/src/mesa/drivers/dri/r300/r300_tex.h
+@@ -37,16 +37,14 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ extern void r300SetDepthTexMode(struct gl_texture_object *tObj);
+ 
++extern void r300SetTexBuffer(__DRIcontext *pDRICtx, GLint target,
++			     __DRIdrawable *dPriv);
++
+ extern void r300SetTexOffset(__DRIcontext *pDRICtx, GLint texname,
+ 			     unsigned long long offset, GLint depth,
+ 			     GLuint pitch);
+ 
+-extern void r300UpdateTextureState(GLcontext * ctx);
+-
+-extern int r300UploadTexImages(r300ContextPtr rmesa, r300TexObjPtr t,
+-			       GLuint face);
+-
+-extern void r300DestroyTexObj(r300ContextPtr rmesa, r300TexObjPtr t);
++extern GLboolean r300ValidateBuffers(GLcontext * ctx);
+ 
+ extern void r300InitTextureFuncs(struct dd_function_table *functions);
+ 
+diff --git a/src/mesa/drivers/dri/r300/r300_texmem.c b/src/mesa/drivers/dri/r300/r300_texmem.c
+deleted file mode 100644
+index b03eefa..0000000
+--- a/src/mesa/drivers/dri/r300/r300_texmem.c
++++ /dev/null
+@@ -1,567 +0,0 @@
+-/**************************************************************************
+-
+-Copyright (C) Tungsten Graphics 2002.  All Rights Reserved.
+-The Weather Channel, Inc. funded Tungsten Graphics to develop the
+-initial release of the Radeon 8500 driver under the XFree86
+-license. This notice must be preserved.
+-
+-Permission is hereby granted, free of charge, to any person obtaining
+-a copy of this software and associated documentation files (the
+-"Software"), to deal in the Software without restriction, including
+-without limitation on the rights to use, copy, modify, merge, publish,
+-distribute, sub license, and/or sell copies of the Software, and to
+-permit persons to whom the Software is furnished to do so, subject to
+-the following conditions:
+-
+-The above copyright notice and this permission notice (including the
+-next paragraph) shall be included in all copies or substantial
+-portions of the Software.
+-
+-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+-NON-INFRINGEMENT. IN NO EVENT SHALL ATI, VA LINUX SYSTEMS AND/OR THEIR
+-SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+-IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+-IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+-SOFTWARE.
+-
+-**************************************************************************/
+-
+-/**
+- * \file
+- *
+- * \author Gareth Hughes <gareth@valinux.com>
+- *
+- * \author Kevin E. Martin <martin@valinux.com>
+- */
+-
+-#include <errno.h>
+-
+-#include "main/glheader.h"
+-#include "main/imports.h"
+-#include "main/context.h"
+-#include "main/colormac.h"
+-#include "main/macros.h"
+-#include "main/simple_list.h"
+-#include "radeon_reg.h"		/* gets definition for usleep */
+-#include "r300_context.h"
+-#include "r300_state.h"
+-#include "r300_cmdbuf.h"
+-#include "radeon_ioctl.h"
+-#include "r300_tex.h"
+-#include "r300_ioctl.h"
+-#include <unistd.h>		/* for usleep() */
+-
+-#ifdef USER_BUFFERS
+-#include "r300_mem.h"
+-#endif
+-
+-/**
+- * Destroy any device-dependent state associated with the texture.  This may
+- * include NULLing out hardware state that points to the texture.
+- */
+-void r300DestroyTexObj(r300ContextPtr rmesa, r300TexObjPtr t)
+-{
+-	int i;
+-
+-	if (RADEON_DEBUG & DEBUG_TEXTURE) {
+-		fprintf(stderr, "%s( %p, %p )\n", __FUNCTION__,
+-			(void *)t, (void *)t->base.tObj);
+-	}
+-
+-	for (i = 0; i < rmesa->radeon.glCtx->Const.MaxTextureUnits; i++) {
+-		if (rmesa->state.texture.unit[i].texobj == t) {
+-			rmesa->state.texture.unit[i].texobj = NULL;
+-		}
+-	}
+-}
+-
+-/* ------------------------------------------------------------
+- * Texture image conversions
+- */
+-
+-static void r300UploadGARTClientSubImage(r300ContextPtr rmesa,
+-					 r300TexObjPtr t,
+-					 struct gl_texture_image *texImage,
+-					 GLint hwlevel,
+-					 GLint x, GLint y,
+-					 GLint width, GLint height)
+-{
+-	const struct gl_texture_format *texFormat = texImage->TexFormat;
+-	GLuint srcPitch, dstPitch;
+-	int blit_format;
+-	int srcOffset;
+-
+-	/*
+-	 * XXX it appears that we always upload the full image, not a subimage.
+-	 * I.e. x==0, y==0, width=texWidth, height=texWidth.  If this is ever
+-	 * changed, the src pitch will have to change.
+-	 */
+-	switch (texFormat->TexelBytes) {
+-	case 1:
+-		blit_format = R300_CP_COLOR_FORMAT_CI8;
+-		srcPitch = t->image[0][0].width * texFormat->TexelBytes;
+-		dstPitch = t->image[0][0].width * texFormat->TexelBytes;
+-		break;
+-	case 2:
+-		blit_format = R300_CP_COLOR_FORMAT_RGB565;
+-		srcPitch = t->image[0][0].width * texFormat->TexelBytes;
+-		dstPitch = t->image[0][0].width * texFormat->TexelBytes;
+-		break;
+-	case 4:
+-		blit_format = R300_CP_COLOR_FORMAT_ARGB8888;
+-		srcPitch = t->image[0][0].width * texFormat->TexelBytes;
+-		dstPitch = t->image[0][0].width * texFormat->TexelBytes;
+-		break;
+-	case 8:
+-	case 16:
+-		blit_format = R300_CP_COLOR_FORMAT_CI8;
+-		srcPitch = t->image[0][0].width * texFormat->TexelBytes;
+-		dstPitch = t->image[0][0].width * texFormat->TexelBytes;
+-		break;
+-	default:
+-		return;
+-	}
+-
+-	t->image[0][hwlevel].data = texImage->Data;
+-	srcOffset = r300GartOffsetFromVirtual(rmesa, texImage->Data);
+-
+-	assert(srcOffset != ~0);
+-
+-	/* Don't currently need to cope with small pitches?
+-	 */
+-	width = texImage->Width;
+-	height = texImage->Height;
+-
+-	if (texFormat->TexelBytes > 4) {
+-		width *= texFormat->TexelBytes;
+-	}
+-
+-	r300EmitWait(rmesa, R300_WAIT_3D);
+-
+-	r300EmitBlit(rmesa, blit_format,
+-		     srcPitch,
+-		     srcOffset,
+-		     dstPitch,
+-		     t->bufAddr,
+-		     x,
+-		     y,
+-		     t->image[0][hwlevel].x + x,
+-		     t->image[0][hwlevel].y + y, width, height);
+-
+-	r300EmitWait(rmesa, R300_WAIT_2D);
+-}
+-
+-static void r300UploadRectSubImage(r300ContextPtr rmesa,
+-				   r300TexObjPtr t,
+-				   struct gl_texture_image *texImage,
+-				   GLint x, GLint y, GLint width, GLint height)
+-{
+-	const struct gl_texture_format *texFormat = texImage->TexFormat;
+-	int blit_format, dstPitch, done;
+-
+-	switch (texFormat->TexelBytes) {
+-	case 1:
+-		blit_format = R300_CP_COLOR_FORMAT_CI8;
+-		break;
+-	case 2:
+-		blit_format = R300_CP_COLOR_FORMAT_RGB565;
+-		break;
+-	case 4:
+-		blit_format = R300_CP_COLOR_FORMAT_ARGB8888;
+-		break;
+-	case 8:
+-	case 16:
+-		blit_format = R300_CP_COLOR_FORMAT_CI8;
+-		break;
+-	default:
+-		return;
+-	}
+-
+-	t->image[0][0].data = texImage->Data;
+-
+-	/* Currently don't need to cope with small pitches.
+-	 */
+-	width = texImage->Width;
+-	height = texImage->Height;
+-	dstPitch = t->pitch;
+-
+-	if (texFormat->TexelBytes > 4) {
+-		width *= texFormat->TexelBytes;
+-	}
+-
+-	if (rmesa->prefer_gart_client_texturing && texImage->IsClientData) {
+-		/* In this case, could also use GART texturing.  This is
+-		 * currently disabled, but has been tested & works.
+-		 */
+-		t->offset = r300GartOffsetFromVirtual(rmesa, texImage->Data);
+-		t->pitch = texImage->RowStride * texFormat->TexelBytes - 32;
+-
+-		if (RADEON_DEBUG & DEBUG_TEXTURE)
+-			fprintf(stderr,
+-				"Using GART texturing for rectangular client texture\n");
+-
+-		/* Release FB memory allocated for this image:
+-		 */
+-		/* FIXME This may not be correct as driSwapOutTextureObject sets
+-		 * FIXME dirty_images.  It may be fine, though.
+-		 */
+-		if (t->base.memBlock) {
+-			driSwapOutTextureObject((driTextureObject *) t);
+-		}
+-	} else if (texImage->IsClientData) {
+-		/* Data already in GART memory, with usable pitch.
+-		 */
+-		GLuint srcPitch;
+-		srcPitch = texImage->RowStride * texFormat->TexelBytes;
+-		r300EmitBlit(rmesa,
+-			     blit_format,
+-			     srcPitch,
+-			     r300GartOffsetFromVirtual(rmesa, texImage->Data),
+-			     dstPitch, t->bufAddr, 0, 0, 0, 0, width, height);
+-	} else {
+-		/* Data not in GART memory, or bad pitch.
+-		 */
+-		for (done = 0; done < height;) {
+-			struct r300_dma_region region;
+-			int lines =
+-			    MIN2(height - done, RADEON_BUFFER_SIZE / dstPitch);
+-			int src_pitch;
+-			char *tex;
+-
+-			src_pitch = texImage->RowStride * texFormat->TexelBytes;
+-
+-			tex = (char *)texImage->Data + done * src_pitch;
+-
+-			memset(&region, 0, sizeof(region));
+-			r300AllocDmaRegion(rmesa, &region, lines * dstPitch,
+-					   1024);
+-
+-			/* Copy texdata to dma:
+-			 */
+-			if (RADEON_DEBUG & DEBUG_TEXTURE)
+-				fprintf(stderr,
+-					"%s: src_pitch %d dst_pitch %d\n",
+-					__FUNCTION__, src_pitch, dstPitch);
+-
+-			if (src_pitch == dstPitch) {
+-				memcpy(region.address + region.start, tex,
+-				       lines * src_pitch);
+-			} else {
+-				char *buf = region.address + region.start;
+-				int i;
+-				for (i = 0; i < lines; i++) {
+-					memcpy(buf, tex, src_pitch);
+-					buf += dstPitch;
+-					tex += src_pitch;
+-				}
+-			}
+-
+-			r300EmitWait(rmesa, R300_WAIT_3D);
+-
+-			/* Blit to framebuffer
+-			 */
+-			r300EmitBlit(rmesa,
+-				     blit_format,
+-				     dstPitch, GET_START(&region),
+-				     dstPitch | (t->tile_bits >> 16),
+-				     t->bufAddr, 0, 0, 0, done, width, lines);
+-
+-			r300EmitWait(rmesa, R300_WAIT_2D);
+-#ifdef USER_BUFFERS
+-			r300_mem_use(rmesa, region.buf->id);
+-#endif
+-
+-			r300ReleaseDmaRegion(rmesa, &region, __FUNCTION__);
+-			done += lines;
+-		}
+-	}
+-}
+-
+-/**
+- * Upload the texture image associated with texture \a t at the specified
+- * level at the address relative to \a start.
+- */
+-static void r300UploadSubImage(r300ContextPtr rmesa, r300TexObjPtr t,
+-			       GLint hwlevel,
+-			       GLint x, GLint y, GLint width, GLint height,
+-			       GLuint face)
+-{
+-	struct gl_texture_image *texImage = NULL;
+-	GLuint offset;
+-	GLint imageWidth, imageHeight;
+-	GLint ret;
+-	drm_radeon_texture_t tex;
+-	drm_radeon_tex_image_t tmp;
+-	const int level = hwlevel + t->base.firstLevel;
+-
+-	if (RADEON_DEBUG & DEBUG_TEXTURE) {
+-		fprintf(stderr,
+-			"%s( %p, %p ) level/width/height/face = %d/%d/%d/%u\n",
+-			__FUNCTION__, (void *)t, (void *)t->base.tObj, level,
+-			width, height, face);
+-	}
+-
+-	ASSERT(face < 6);
+-
+-	/* Ensure we have a valid texture to upload */
+-	if ((hwlevel < 0) || (hwlevel >= RADEON_MAX_TEXTURE_LEVELS)) {
+-		_mesa_problem(NULL, "bad texture level in %s", __FUNCTION__);
+-		return;
+-	}
+-
+-	texImage = t->base.tObj->Image[face][level];
+-
+-	if (!texImage) {
+-		if (RADEON_DEBUG & DEBUG_TEXTURE)
+-			fprintf(stderr, "%s: texImage %d is NULL!\n",
+-				__FUNCTION__, level);
+-		return;
+-	}
+-	if (!texImage->Data) {
+-		if (RADEON_DEBUG & DEBUG_TEXTURE)
+-			fprintf(stderr, "%s: image data is NULL!\n",
+-				__FUNCTION__);
+-		return;
+-	}
+-
+-	if (t->base.tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
+-		assert(level == 0);
+-		assert(hwlevel == 0);
+-		if (RADEON_DEBUG & DEBUG_TEXTURE)
+-			fprintf(stderr, "%s: image data is rectangular\n",
+-				__FUNCTION__);
+-		r300UploadRectSubImage(rmesa, t, texImage, x, y, width, height);
+-		return;
+-	} else if (texImage->IsClientData) {
+-		if (RADEON_DEBUG & DEBUG_TEXTURE)
+-			fprintf(stderr,
+-				"%s: image data is in GART client storage\n",
+-				__FUNCTION__);
+-		r300UploadGARTClientSubImage(rmesa, t, texImage, hwlevel, x, y,
+-					     width, height);
+-		return;
+-	} else if (RADEON_DEBUG & DEBUG_TEXTURE)
+-		fprintf(stderr, "%s: image data is in normal memory\n",
+-			__FUNCTION__);
+-
+-	imageWidth = texImage->Width;
+-	imageHeight = texImage->Height;
+-
+-	offset = t->bufAddr;
+-
+-	if (RADEON_DEBUG & (DEBUG_TEXTURE | DEBUG_IOCTL)) {
+-		GLint imageX = 0;
+-		GLint imageY = 0;
+-		GLint blitX = t->image[face][hwlevel].x;
+-		GLint blitY = t->image[face][hwlevel].y;
+-		GLint blitWidth = t->image[face][hwlevel].width;
+-		GLint blitHeight = t->image[face][hwlevel].height;
+-		fprintf(stderr, "   upload image: %d,%d at %d,%d\n",
+-			imageWidth, imageHeight, imageX, imageY);
+-		fprintf(stderr, "   upload  blit: %d,%d at %d,%d\n",
+-			blitWidth, blitHeight, blitX, blitY);
+-		fprintf(stderr, "       blit ofs: 0x%07x level: %d/%d\n",
+-			(GLuint) offset, hwlevel, level);
+-	}
+-
+-	t->image[face][hwlevel].data = texImage->Data;
+-
+-	/* Init the DRM_RADEON_TEXTURE command / drm_radeon_texture_t struct.
+-	 * NOTE: we're always use a 1KB-wide blit and I8 texture format.
+-	 * We used to use 1, 2 and 4-byte texels and used to use the texture
+-	 * width to dictate the blit width - but that won't work for compressed
+-	 * textures. (Brian)
+-	 * NOTE: can't do that with texture tiling. (sroland)
+-	 */
+-	tex.offset = offset;
+-	tex.image = &tmp;
+-	/* copy (x,y,width,height,data) */
+-	memcpy(&tmp, &t->image[face][hwlevel], sizeof(tmp));
+-
+-	if (texImage->TexFormat->TexelBytes > 4) {
+-		const int log2TexelBytes =
+-		    (3 + (texImage->TexFormat->TexelBytes >> 4));
+-		tex.format = RADEON_TXFORMAT_I8;	/* any 1-byte texel format */
+-		tex.pitch =
+-		    MAX2((texImage->Width * texImage->TexFormat->TexelBytes) /
+-			 64, 1);
+-		tex.height = imageHeight;
+-		tex.width = imageWidth << log2TexelBytes;
+-		tex.offset += (tmp.x << log2TexelBytes) & ~1023;
+-		tmp.x = tmp.x % (1024 >> log2TexelBytes);
+-		tmp.width = tmp.width << log2TexelBytes;
+-	} else if (texImage->TexFormat->TexelBytes) {
+-		/* use multi-byte upload scheme */
+-		tex.height = imageHeight;
+-		tex.width = imageWidth;
+-		switch (texImage->TexFormat->TexelBytes) {
+-		case 1:
+-			tex.format = RADEON_TXFORMAT_I8;
+-			break;
+-		case 2:
+-			tex.format = RADEON_TXFORMAT_AI88;
+-			break;
+-		case 4:
+-			tex.format = RADEON_TXFORMAT_ARGB8888;
+-			break;
+-		}
+-		tex.pitch =
+-		    MAX2((texImage->Width * texImage->TexFormat->TexelBytes) /
+-			 64, 1);
+-		tex.offset += tmp.x & ~1023;
+-		tmp.x = tmp.x % 1024;
+-
+-		if (t->tile_bits & R300_TXO_MICRO_TILE) {
+-			/* need something like "tiled coordinates" ? */
+-			tmp.y = tmp.x / (tex.pitch * 128) * 2;
+-			tmp.x =
+-			    tmp.x % (tex.pitch * 128) / 2 /
+-			    texImage->TexFormat->TexelBytes;
+-			tex.pitch |= RADEON_DST_TILE_MICRO >> 22;
+-		} else {
+-			tmp.x = tmp.x >> (texImage->TexFormat->TexelBytes >> 1);
+-		}
+-#if 1
+-		if ((t->tile_bits & R300_TXO_MACRO_TILE) &&
+-		    (texImage->Width * texImage->TexFormat->TexelBytes >= 256)
+-		    && ((!(t->tile_bits & R300_TXO_MICRO_TILE)
+-			 && (texImage->Height >= 8))
+-			|| (texImage->Height >= 16))) {
+-			/* weird: R200 disables macro tiling if mip width is smaller than 256 bytes,
+-			   OR if height is smaller than 8 automatically, but if micro tiling is active
+-			   the limit is height 16 instead ? */
+-			tex.pitch |= RADEON_DST_TILE_MACRO >> 22;
+-		}
+-#endif
+-	} else {
+-		/* In case of for instance 8x8 texture (2x2 dxt blocks),
+-		   padding after the first two blocks is needed (only
+-		   with dxt1 since 2 dxt3/dxt5 blocks already use 32 Byte). */
+-		/* set tex.height to 1/4 since 1 "macropixel" (dxt-block)
+-		   has 4 real pixels. Needed so the kernel module reads
+-		   the right amount of data. */
+-		tex.format = RADEON_TXFORMAT_I8;	/* any 1-byte texel format */
+-		tex.pitch = (R300_BLIT_WIDTH_BYTES / 64);
+-		tex.height = (imageHeight + 3) / 4;
+-		tex.width = (imageWidth + 3) / 4;
+-		if ((t->format & R300_TX_FORMAT_DXT1) == R300_TX_FORMAT_DXT1) {
+-			tex.width *= 8;
+-		} else {
+-			tex.width *= 16;
+-		}
+-	}
+-
+-	LOCK_HARDWARE(&rmesa->radeon);
+-	do {
+-		ret =
+-		    drmCommandWriteRead(rmesa->radeon.dri.fd,
+-					DRM_RADEON_TEXTURE, &tex,
+-					sizeof(drm_radeon_texture_t));
+-		if (ret) {
+-			if (RADEON_DEBUG & DEBUG_IOCTL)
+-				fprintf(stderr,
+-					"DRM_RADEON_TEXTURE:  again!\n");
+-			usleep(1);
+-		}
+-	} while (ret == -EAGAIN);
+-
+-	UNLOCK_HARDWARE(&rmesa->radeon);
+-
+-	if (ret) {
+-		fprintf(stderr, "DRM_RADEON_TEXTURE: return = %d\n", ret);
+-		fprintf(stderr, "   offset=0x%08x\n", offset);
+-		fprintf(stderr, "   image width=%d height=%d\n",
+-			imageWidth, imageHeight);
+-		fprintf(stderr, "    blit width=%d height=%d data=%p\n",
+-			t->image[face][hwlevel].width,
+-			t->image[face][hwlevel].height,
+-			t->image[face][hwlevel].data);
+-		_mesa_exit(-1);
+-	}
+-}
+-
+-/**
+- * Upload the texture images associated with texture \a t.  This might
+- * require the allocation of texture memory.
+- *
+- * \param rmesa Context pointer
+- * \param t Texture to be uploaded
+- * \param face Cube map face to be uploaded.  Zero for non-cube maps.
+- */
+-
+-int r300UploadTexImages(r300ContextPtr rmesa, r300TexObjPtr t, GLuint face)
+-{
+-	const int numLevels = t->base.lastLevel - t->base.firstLevel + 1;
+-
+-	if (t->image_override)
+-		return 0;
+-
+-	if (RADEON_DEBUG & (DEBUG_TEXTURE | DEBUG_IOCTL)) {
+-		fprintf(stderr, "%s( %p, %p ) sz=%d lvls=%d-%d\n", __FUNCTION__,
+-			(void *)rmesa->radeon.glCtx, (void *)t->base.tObj,
+-			t->base.totalSize, t->base.firstLevel,
+-			t->base.lastLevel);
+-	}
+-
+-	if (t->base.totalSize == 0)
+-		return 0;
+-
+-	if (RADEON_DEBUG & DEBUG_SYNC) {
+-		fprintf(stderr, "%s: Syncing\n", __FUNCTION__);
+-		radeonFinish(rmesa->radeon.glCtx);
+-	}
+-
+-	LOCK_HARDWARE(&rmesa->radeon);
+-
+-	if (t->base.memBlock == NULL) {
+-		int heap;
+-
+-		heap = driAllocateTexture(rmesa->texture_heaps, rmesa->nr_heaps,
+-					  (driTextureObject *) t);
+-		if (heap == -1) {
+-			UNLOCK_HARDWARE(&rmesa->radeon);
+-			return -1;
+-		}
+-
+-		/* Set the base offset of the texture image */
+-		t->bufAddr = rmesa->radeon.radeonScreen->texOffset[heap]
+-		    + t->base.memBlock->ofs;
+-		t->offset = t->bufAddr;
+-
+-		if (!(t->base.tObj->Image[0][0]->IsClientData)) {
+-			/* hope it's safe to add that here... */
+-			t->offset |= t->tile_bits;
+-		}
+-	}
+-
+-	/* Let the world know we've used this memory recently.
+-	 */
+-	driUpdateTextureLRU((driTextureObject *) t);
+-	UNLOCK_HARDWARE(&rmesa->radeon);
+-
+-	/* Upload any images that are new */
+-	if (t->base.dirty_images[face]) {
+-		int i;
+-		for (i = 0; i < numLevels; i++) {
+-			if ((t->base.
+-			     dirty_images[face] & (1 <<
+-						   (i + t->base.firstLevel))) !=
+-			    0) {
+-				r300UploadSubImage(rmesa, t, i, 0, 0,
+-						   t->image[face][i].width,
+-						   t->image[face][i].height,
+-						   face);
+-			}
+-		}
+-		t->base.dirty_images[face] = 0;
+-	}
+-
+-	if (RADEON_DEBUG & DEBUG_SYNC) {
+-		fprintf(stderr, "%s: Syncing\n", __FUNCTION__);
+-		radeonFinish(rmesa->radeon.glCtx);
+-	}
+-
+-	return 0;
+-}
+diff --git a/src/mesa/drivers/dri/r300/r300_texstate.c b/src/mesa/drivers/dri/r300/r300_texstate.c
+index e2329f0..8a90069 100644
+--- a/src/mesa/drivers/dri/r300/r300_texstate.c
++++ b/src/mesa/drivers/dri/r300/r300_texstate.c
+@@ -47,7 +47,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "r300_context.h"
+ #include "r300_state.h"
+ #include "r300_ioctl.h"
+-#include "radeon_ioctl.h"
++#include "radeon_mipmap_tree.h"
+ #include "r300_tex.h"
+ #include "r300_reg.h"
+ 
+@@ -143,13 +143,12 @@ void r300SetDepthTexMode(struct gl_texture_object *tObj)
+ 		},
+ 	};
+ 	const GLuint *format;
+-	r300TexObjPtr t;
++	radeonTexObjPtr t;
+ 
+ 	if (!tObj)
+ 		return;
+ 
+-	t = (r300TexObjPtr) tObj->DriverData;
+-
++	t = radeon_tex_obj(tObj);
+ 
+ 	switch (tObj->Image[0][tObj->BaseLevel]->TexFormat->MesaFormat) {
+ 	case MESA_FORMAT_Z16:
+@@ -171,13 +170,13 @@ void r300SetDepthTexMode(struct gl_texture_object *tObj)
+ 
+ 	switch (tObj->DepthMode) {
+ 	case GL_LUMINANCE:
+-		t->format = format[0];
++		t->pp_txformat = format[0];
+ 		break;
+ 	case GL_INTENSITY:
+-		t->format = format[1];
++		t->pp_txformat = format[1];
+ 		break;
+ 	case GL_ALPHA:
+-		t->format = format[2];
++		t->pp_txformat = format[2];
+ 		break;
+ 	default:
+ 		/* Error...which should have already been caught by higher
+@@ -190,479 +189,309 @@ void r300SetDepthTexMode(struct gl_texture_object *tObj)
+ 
+ 
+ /**
+- * Compute sizes and fill in offset and blit information for the given
+- * image (determined by \p face and \p level).
+- *
+- * \param curOffset points to the offset at which the image is to be stored
+- * and is updated by this function according to the size of the image.
+- */
+-static void compute_tex_image_offset(
+-	struct gl_texture_object *tObj,
+-	GLuint face,
+-	GLint level,
+-	GLint* curOffset)
+-{
+-	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
+-	const struct gl_texture_image* texImage;
+-	GLuint blitWidth = R300_BLIT_WIDTH_BYTES;
+-	GLuint texelBytes;
+-	GLuint size;
+-
+-	texImage = tObj->Image[0][level + t->base.firstLevel];
+-	if (!texImage)
+-		return;
+-
+-	texelBytes = texImage->TexFormat->TexelBytes;
+-
+-	/* find image size in bytes */
+-	if (texImage->IsCompressed) {
+-		if ((t->format & R300_TX_FORMAT_DXT1) ==
+-			R300_TX_FORMAT_DXT1) {
+-			// fprintf(stderr,"DXT 1 %d %08X\n", texImage->Width, t->format);
+-			if ((texImage->Width + 3) < 8)	/* width one block */
+-				size = texImage->CompressedSize * 4;
+-			else if ((texImage->Width + 3) < 16)
+-				size = texImage->CompressedSize * 2;
+-			else
+-				size = texImage->CompressedSize;
+-		} else {
+-			/* DXT3/5, 16 bytes per block */
+-			WARN_ONCE
+-				("DXT 3/5 suffers from multitexturing problems!\n");
+-			// fprintf(stderr,"DXT 3/5 %d\n", texImage->Width);
+-			if ((texImage->Width + 3) < 8)
+-				size = texImage->CompressedSize * 2;
+-			else
+-				size = texImage->CompressedSize;
+-		}
+-	} else if (tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
+-		size =
+-			((texImage->Width * texelBytes +
+-			63) & ~63) * texImage->Height;
+-		blitWidth = 64 / texelBytes;
+-	} else if (t->tile_bits & R300_TXO_MICRO_TILE) {
+-		/* tile pattern is 16 bytes x2. mipmaps stay 32 byte aligned,
+-			though the actual offset may be different (if texture is less than
+-			32 bytes width) to the untiled case */
+-		int w = (texImage->Width * texelBytes * 2 + 31) & ~31;
+-		size =
+-			(w * ((texImage->Height + 1) / 2)) *
+-			texImage->Depth;
+-		blitWidth = MAX2(texImage->Width, 64 / texelBytes);
+-	} else {
+-		int w = (texImage->Width * texelBytes + 31) & ~31;
+-		size = w * texImage->Height * texImage->Depth;
+-		blitWidth = MAX2(texImage->Width, 64 / texelBytes);
+-	}
+-	assert(size > 0);
+-
+-	if (RADEON_DEBUG & DEBUG_TEXTURE)
+-		fprintf(stderr, "w=%d h=%d d=%d tb=%d intFormat=%d\n",
+-			texImage->Width, texImage->Height,
+-			texImage->Depth,
+-			texImage->TexFormat->TexelBytes,
+-			texImage->InternalFormat);
+-
+-	/* All images are aligned to a 32-byte offset */
+-	*curOffset = (*curOffset + 0x1f) & ~0x1f;
+-
+-	if (texelBytes) {
+-		/* fix x and y coords up later together with offset */
+-		t->image[face][level].x = *curOffset;
+-		t->image[face][level].y = 0;
+-		t->image[face][level].width =
+-			MIN2(size / texelBytes, blitWidth);
+-		t->image[face][level].height =
+-			(size / texelBytes) / t->image[face][level].width;
+-	} else {
+-		t->image[face][level].x = *curOffset % R300_BLIT_WIDTH_BYTES;
+-		t->image[face][level].y = *curOffset / R300_BLIT_WIDTH_BYTES;
+-		t->image[face][level].width =
+-			MIN2(size, R300_BLIT_WIDTH_BYTES);
+-		t->image[face][level].height = size / t->image[face][level].width;
+-	}
+-
+-	if (RADEON_DEBUG & DEBUG_TEXTURE)
+-		fprintf(stderr,
+-			"level %d, face %d: %dx%d x=%d y=%d w=%d h=%d size=%d at %d\n",
+-			level, face, texImage->Width, texImage->Height,
+-			t->image[face][level].x, t->image[face][level].y,
+-			t->image[face][level].width, t->image[face][level].height,
+-			size, *curOffset);
+-
+-	*curOffset += size;
+-}
+-
+-
+-
+-/**
+- * This function computes the number of bytes of storage needed for
+- * the given texture object (all mipmap levels, all cube faces).
+- * The \c image[face][level].x/y/width/height parameters for upload/blitting
+- * are computed here.  \c filter, \c format, etc. will be set here
+- * too.
++ * Compute the cached hardware register values for the given texture object.
+  *
+  * \param rmesa Context pointer
+- * \param tObj GL texture object whose images are to be posted to
+- *                 hardware state.
++ * \param t the r300 texture object
+  */
+-static void r300SetTexImages(r300ContextPtr rmesa,
+-			     struct gl_texture_object *tObj)
++static void setup_hardware_state(r300ContextPtr rmesa, radeonTexObj *t)
+ {
+-	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
+-	const struct gl_texture_image *baseImage =
+-	    tObj->Image[0][tObj->BaseLevel];
+-	GLint curOffset;
+-	GLint i, texelBytes;
+-	GLint numLevels;
+-	GLint log2Width, log2Height, log2Depth;
+-
+-	/* Set the hardware texture format
+-	 */
++	const struct gl_texture_image *firstImage;
++	int firstlevel = t->mt ? t->mt->firstLevel : 0;
++	    
++	firstImage = t->base.Image[0][firstlevel];
++
+ 	if (!t->image_override
+-	    && VALID_FORMAT(baseImage->TexFormat->MesaFormat)) {
+-		if (baseImage->TexFormat->BaseFormat == GL_DEPTH_COMPONENT) {
+-			r300SetDepthTexMode(tObj);
++	    && VALID_FORMAT(firstImage->TexFormat->MesaFormat)) {
++		if (firstImage->TexFormat->BaseFormat == GL_DEPTH_COMPONENT) {
++			r300SetDepthTexMode(&t->base);
+ 		} else {
+-			t->format = tx_table[baseImage->TexFormat->MesaFormat].format;
++			t->pp_txformat = tx_table[firstImage->TexFormat->MesaFormat].format;
+ 		}
+ 
+-		t->filter |= tx_table[baseImage->TexFormat->MesaFormat].filter;
++		t->pp_txfilter |= tx_table[firstImage->TexFormat->MesaFormat].filter;
+ 	} else if (!t->image_override) {
+ 		_mesa_problem(NULL, "unexpected texture format in %s",
+ 			      __FUNCTION__);
+ 		return;
+ 	}
+ 
+-	texelBytes = baseImage->TexFormat->TexelBytes;
+-
+-	/* Compute which mipmap levels we really want to send to the hardware.
+-	 */
+-	driCalculateTextureFirstLastLevel((driTextureObject *) t);
+-	log2Width = tObj->Image[0][t->base.firstLevel]->WidthLog2;
+-	log2Height = tObj->Image[0][t->base.firstLevel]->HeightLog2;
+-	log2Depth = tObj->Image[0][t->base.firstLevel]->DepthLog2;
+-
+-	numLevels = t->base.lastLevel - t->base.firstLevel + 1;
++	if (t->image_override && t->bo)
++		return;
+ 
+-	assert(numLevels <= RADEON_MAX_TEXTURE_LEVELS);
++	t->pp_txsize = (((firstImage->Width - 1) << R300_TX_WIDTHMASK_SHIFT)
++			| ((firstImage->Height - 1) << R300_TX_HEIGHTMASK_SHIFT)
++			| ((firstImage->DepthLog2) << R300_TX_DEPTHMASK_SHIFT)
++			| ((t->mt->lastLevel - t->mt->firstLevel) << R300_TX_MAX_MIP_LEVEL_SHIFT));
+ 
+-	/* Calculate mipmap offsets and dimensions for blitting (uploading)
+-	 * The idea is that we lay out the mipmap levels within a block of
+-	 * memory organized as a rectangle of width BLIT_WIDTH_BYTES.
+-	 */
+ 	t->tile_bits = 0;
+ 
+-	/* figure out if this texture is suitable for tiling. */
+-#if 0				/* Disabled for now */
+-	if (texelBytes) {
+-		if ((tObj->Target != GL_TEXTURE_RECTANGLE_NV) &&
+-		    /* texrect might be able to use micro tiling too in theory? */
+-		    (baseImage->Height > 1)) {
+-
+-			/* allow 32 (bytes) x 1 mip (which will use two times the space
+-			   the non-tiled version would use) max if base texture is large enough */
+-			if ((numLevels == 1) ||
+-			    (((baseImage->Width * texelBytes /
+-			       baseImage->Height) <= 32)
+-			     && (baseImage->Width * texelBytes > 64))
+-			    ||
+-			    ((baseImage->Width * texelBytes /
+-			      baseImage->Height) <= 16)) {
+-				t->tile_bits |= R300_TXO_MICRO_TILE;
+-			}
+-		}
++	if (t->base.Target == GL_TEXTURE_CUBE_MAP)
++		t->pp_txformat |= R300_TX_FORMAT_CUBIC_MAP;
++	if (t->base.Target == GL_TEXTURE_3D)
++		t->pp_txformat |= R300_TX_FORMAT_3D;
+ 
+-		if (tObj->Target != GL_TEXTURE_RECTANGLE_NV) {
+-			/* we can set macro tiling even for small textures, they will be untiled anyway */
+-			t->tile_bits |= R300_TXO_MACRO_TILE;
+-		}
+-	}
+-#endif
+ 
+-	curOffset = 0;
+-
+-	if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
+-		ASSERT(log2Width == log2Height);
+-		t->format |= R300_TX_FORMAT_CUBIC_MAP;
+-
+-		for(i = 0; i < numLevels; i++) {
+-			GLuint face;
+-			for(face = 0; face < 6; face++)
+-				compute_tex_image_offset(tObj, face, i, &curOffset);
+-		}
+-	} else {
+-		if (tObj->Target == GL_TEXTURE_3D)
+-                	t->format |= R300_TX_FORMAT_3D;
+-
+-		for (i = 0; i < numLevels; i++)
+-			compute_tex_image_offset(tObj, 0, i, &curOffset);
+-	}
+-
+-	/* Align the total size of texture memory block.
+-	 */
+-	t->base.totalSize =
+-	    (curOffset + RADEON_OFFSET_MASK) & ~RADEON_OFFSET_MASK;
+-
+-	t->size =
+-	    (((tObj->Image[0][t->base.firstLevel]->Width -
+-	       1) << R300_TX_WIDTHMASK_SHIFT)
+-	     | ((tObj->Image[0][t->base.firstLevel]->Height - 1) <<
+-		R300_TX_HEIGHTMASK_SHIFT)
+-	     | ((tObj->Image[0][t->base.firstLevel]->DepthLog2) <<
+-		R300_TX_DEPTHMASK_SHIFT))
+-	    | ((numLevels - 1) << R300_TX_MAX_MIP_LEVEL_SHIFT);
+-
+-	t->pitch = 0;
+-
+-	/* Only need to round to nearest 32 for textures, but the blitter
+-	 * requires 64-byte aligned pitches, and we may/may not need the
+-	 * blitter.   NPOT only!
+-	 */
+-	if (baseImage->IsCompressed) {
+-		t->pitch |=
+-		    (tObj->Image[0][t->base.firstLevel]->Width + 63) & ~(63);
+-	} else if (tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
+-		unsigned int align = (64 / texelBytes) - 1;
+-		t->pitch |= ((tObj->Image[0][t->base.firstLevel]->Width *
+-			     texelBytes) + 63) & ~(63);
+-		t->size |= R300_TX_SIZE_TXPITCH_EN;
++	if (t->base.Target == GL_TEXTURE_RECTANGLE_NV) {
++		unsigned int align = (64 / t->mt->bpp) - 1;
++		t->pp_txsize |= R300_TX_SIZE_TXPITCH_EN;
+ 		if (!t->image_override)
+-			t->pitch_reg =
+-			    (((tObj->Image[0][t->base.firstLevel]->Width) +
+-			      align) & ~align) - 1;
+-	} else {
+-		t->pitch |=
+-		    ((tObj->Image[0][t->base.firstLevel]->Width *
+-		      texelBytes) + 63) & ~(63);
++			t->pp_txpitch = ((firstImage->Width + align) & ~align) - 1;
+ 	}
+ 
+ 	if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
+-	    if (tObj->Image[0][t->base.firstLevel]->Width > 2048)
+-		t->pitch_reg |= R500_TXWIDTH_BIT11;
+-	    if (tObj->Image[0][t->base.firstLevel]->Height > 2048)
+-		t->pitch_reg |= R500_TXHEIGHT_BIT11;
++	    if (firstImage->Width > 2048)
++		t->pp_txpitch |= R500_TXWIDTH_BIT11;
++	    if (firstImage->Height > 2048)
++		t->pp_txpitch |= R500_TXHEIGHT_BIT11;
+ 	}
+ }
+ 
+-/* ================================================================
+- * Texture unit state management
++/**
++ * Ensure the given texture is ready for rendering.
++ *
++ * Mostly this means populating the texture object's mipmap tree.
+  */
+-
+-static GLboolean r300EnableTexture2D(GLcontext * ctx, int unit)
++static GLboolean r300_validate_texture(GLcontext * ctx, struct gl_texture_object *texObj)
+ {
+ 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+-	struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+-	struct gl_texture_object *tObj = texUnit->_Current;
+-	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
+-
+-	ASSERT(tObj->Target == GL_TEXTURE_2D || tObj->Target == GL_TEXTURE_1D);
++	radeonTexObj *t = radeon_tex_obj(texObj);
+ 
+-	if (t->base.dirty_images[0]) {
+-		R300_FIREVERTICES(rmesa);
++	if (!radeon_validate_texture_miptree(ctx, texObj))
++		return GL_FALSE;
+ 
+-		r300SetTexImages(rmesa, tObj);
+-		r300UploadTexImages(rmesa, (r300TexObjPtr) tObj->DriverData, 0);
+-		if (!t->base.memBlock && !t->image_override)
+-			return GL_FALSE;
+-	}
++	/* Configure the hardware registers (more precisely, the cached version
++	 * of the hardware registers). */
++	setup_hardware_state(rmesa, t);
+ 
++	t->validated = GL_TRUE;
+ 	return GL_TRUE;
+ }
+ 
+-static GLboolean r300EnableTexture3D(GLcontext * ctx, int unit)
++
++/**
++ * Ensure all enabled and complete textures are uploaded along with any buffers being used.
++ */
++GLboolean r300ValidateBuffers(GLcontext * ctx)
+ {
+ 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+-	struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+-	struct gl_texture_object *tObj = texUnit->_Current;
+-	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
+-
+-	ASSERT(tObj->Target == GL_TEXTURE_3D);
+-
+-	/* r300 does not support mipmaps for 3D textures. */
+-	if ((tObj->MinFilter != GL_NEAREST) && (tObj->MinFilter != GL_LINEAR)) {
+-		return GL_FALSE;
++	struct radeon_cs_space_check bos[16];
++	struct radeon_renderbuffer *rrb;
++	int num_bo = 0;
++	int i;
++	int flushed = 0, ret;
++again:
++	num_bo = 0;
++
++	rrb = radeon_get_colorbuffer(&rmesa->radeon);
++	/* color buffer */
++	if (rrb && rrb->bo) {
++		bos[num_bo].bo = rrb->bo;
++		bos[num_bo].read_domains = 0;
++		bos[num_bo].write_domain = RADEON_GEM_DOMAIN_VRAM;
++		bos[num_bo].new_accounted = 0;
++		num_bo++;
+ 	}
+ 
+-	if (t->base.dirty_images[0]) {
+-		R300_FIREVERTICES(rmesa);
+-		r300SetTexImages(rmesa, tObj);
+-		r300UploadTexImages(rmesa, (r300TexObjPtr) tObj->DriverData, 0);
+-		if (!t->base.memBlock)
+-			return GL_FALSE;
++	/* depth buffer */
++	rrb = radeon_get_depthbuffer(&rmesa->radeon);
++	/* add the depth buffer's BO, if any, to the space check */
++	if (rrb && rrb->bo) {
++		bos[num_bo].bo = rrb->bo;
++		bos[num_bo].read_domains = 0;
++		bos[num_bo].write_domain = RADEON_GEM_DOMAIN_VRAM;
++		bos[num_bo].new_accounted = 0;
++		num_bo++;
+ 	}
++	
++	for (i = 0; i < ctx->Const.MaxTextureImageUnits; ++i) {
++		radeonTexObj *t;
+ 
+-	return GL_TRUE;
+-}
++		if (!ctx->Texture.Unit[i]._ReallyEnabled)
++			continue;
+ 
+-static GLboolean r300EnableTextureCube(GLcontext * ctx, int unit)
+-{
+-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+-	struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+-	struct gl_texture_object *tObj = texUnit->_Current;
+-	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
+-	GLuint face;
+-
+-	ASSERT(tObj->Target == GL_TEXTURE_CUBE_MAP);
+-
+-	if (t->base.dirty_images[0] || t->base.dirty_images[1] ||
+-	    t->base.dirty_images[2] || t->base.dirty_images[3] ||
+-	    t->base.dirty_images[4] || t->base.dirty_images[5]) {
+-		/* flush */
+-		R300_FIREVERTICES(rmesa);
+-		/* layout memory space, once for all faces */
+-		r300SetTexImages(rmesa, tObj);
+-	}
+-
+-	/* upload (per face) */
+-	for (face = 0; face < 6; face++) {
+-		if (t->base.dirty_images[face]) {
+-			r300UploadTexImages(rmesa,
+-					    (r300TexObjPtr) tObj->DriverData,
+-					    face);
++		if (!r300_validate_texture(ctx, ctx->Texture.Unit[i]._Current)) {
++			_mesa_warning(ctx,
++				      "failed to validate texture for unit %d.\n",
++				      i);
+ 		}
++		t = radeon_tex_obj(ctx->Texture.Unit[i]._Current);
++		if (t->image_override && t->bo)
++			bos[num_bo].bo = t->bo;
++		else if (t->mt->bo)
++			bos[num_bo].bo = t->mt->bo;
++		bos[num_bo].read_domains = RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM;
++		bos[num_bo].write_domain = 0;
++		bos[num_bo].new_accounted = 0;
++		num_bo++;
+ 	}
+ 
+-	if (!t->base.memBlock) {
+-		/* texmem alloc failed, use s/w fallback */
++	ret = radeon_cs_space_check(rmesa->radeon.cmdbuf.cs, bos, num_bo);
++	if (ret == RADEON_CS_SPACE_OP_TO_BIG)
+ 		return GL_FALSE;
+-	}
+-
+-	return GL_TRUE;
+-}
+-
+-static GLboolean r300EnableTextureRect(GLcontext * ctx, int unit)
+-{
+-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+-	struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+-	struct gl_texture_object *tObj = texUnit->_Current;
+-	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
+-
+-	ASSERT(tObj->Target == GL_TEXTURE_RECTANGLE_NV);
+-
+-	if (t->base.dirty_images[0]) {
+-		R300_FIREVERTICES(rmesa);
+-
+-		r300SetTexImages(rmesa, tObj);
+-		r300UploadTexImages(rmesa, (r300TexObjPtr) tObj->DriverData, 0);
+-		if (!t->base.memBlock && !t->image_override &&
+-		    !rmesa->prefer_gart_client_texturing)
++	if (ret == RADEON_CS_SPACE_FLUSH) {
++		radeonFlush(ctx);
++		if (flushed)
+ 			return GL_FALSE;
++		flushed = 1;
++		goto again;
+ 	}
+-
+ 	return GL_TRUE;
+ }
+ 
+-static GLboolean r300UpdateTexture(GLcontext * ctx, int unit)
+-{
+-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+-	struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+-	struct gl_texture_object *tObj = texUnit->_Current;
+-	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
+-
+-	/* Fallback if there's a texture border */
+-	if (tObj->Image[0][tObj->BaseLevel]->Border > 0)
+-		return GL_FALSE;
+-
+-	/* Update state if this is a different texture object to last
+-	 * time.
+-	 */
+-	if (rmesa->state.texture.unit[unit].texobj != t) {
+-		if (rmesa->state.texture.unit[unit].texobj != NULL) {
+-			/* The old texture is no longer bound to this texture unit.
+-			 * Mark it as such.
+-			 */
+-
+-			rmesa->state.texture.unit[unit].texobj->base.bound &=
+-			    ~(1 << unit);
+-		}
+-
+-		rmesa->state.texture.unit[unit].texobj = t;
+-		t->base.bound |= (1 << unit);
+-		driUpdateTextureLRU((driTextureObject *) t);	/* XXX: should be locked! */
+-	}
+-
+-	return !t->border_fallback;
+-}
+-
+ void r300SetTexOffset(__DRIcontext * pDRICtx, GLint texname,
+ 		      unsigned long long offset, GLint depth, GLuint pitch)
+ {
+ 	r300ContextPtr rmesa = pDRICtx->driverPrivate;
+ 	struct gl_texture_object *tObj =
+ 	    _mesa_lookup_texture(rmesa->radeon.glCtx, texname);
+-	r300TexObjPtr t;
++	radeonTexObjPtr t = radeon_tex_obj(tObj);
+ 	uint32_t pitch_val;
+ 
+ 	if (!tObj)
+ 		return;
+ 
+-	t = (r300TexObjPtr) tObj->DriverData;
+-
+ 	t->image_override = GL_TRUE;
+ 
+ 	if (!offset)
+ 		return;
+ 
+-	t->offset = offset;
+-	t->pitch_reg &= (1 << 13) -1;
++	t->bo = NULL;
++	t->override_offset = offset;
++	t->pp_txpitch &= (1 << 13) -1;
+ 	pitch_val = pitch;
+ 
+ 	switch (depth) {
+ 	case 32:
+-		t->format = R300_EASY_TX_FORMAT(X, Y, Z, W, W8Z8Y8X8);
+-		t->filter |= tx_table[2].filter;
++		t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, W, W8Z8Y8X8);
++		t->pp_txfilter |= tx_table[2].filter;
+ 		pitch_val /= 4;
+ 		break;
+ 	case 24:
+ 	default:
+-		t->format = R300_EASY_TX_FORMAT(X, Y, Z, ONE, W8Z8Y8X8);
+-		t->filter |= tx_table[4].filter;
++		t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, ONE, W8Z8Y8X8);
++		t->pp_txfilter |= tx_table[4].filter;
+ 		pitch_val /= 4;
+ 		break;
+ 	case 16:
+-		t->format = R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z5Y6X5);
+-		t->filter |= tx_table[5].filter;
++		t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z5Y6X5);
++		t->pp_txfilter |= tx_table[5].filter;
+ 		pitch_val /= 2;
+ 		break;
+ 	}
+ 	pitch_val--;
+ 
+-	t->pitch_reg |= pitch_val;
++	t->pp_txpitch |= pitch_val;
+ }
+ 
+-static GLboolean r300UpdateTextureUnit(GLcontext * ctx, int unit)
++void r300SetTexBuffer(__DRIcontext *pDRICtx, GLint target, __DRIdrawable *dPriv)
+ {
+-	struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+-
+-	if (texUnit->_ReallyEnabled & (TEXTURE_RECT_BIT)) {
+-		return (r300EnableTextureRect(ctx, unit) &&
+-			r300UpdateTexture(ctx, unit));
+-	} else if (texUnit->_ReallyEnabled & (TEXTURE_1D_BIT | TEXTURE_2D_BIT)) {
+-		return (r300EnableTexture2D(ctx, unit) &&
+-			r300UpdateTexture(ctx, unit));
+-	} else if (texUnit->_ReallyEnabled & (TEXTURE_3D_BIT)) {
+-		return (r300EnableTexture3D(ctx, unit) &&
+-			r300UpdateTexture(ctx, unit));
+-	} else if (texUnit->_ReallyEnabled & (TEXTURE_CUBE_BIT)) {
+-		return (r300EnableTextureCube(ctx, unit) &&
+-			r300UpdateTexture(ctx, unit));
+-	} else if (texUnit->_ReallyEnabled) {
+-		return GL_FALSE;
+-	} else {
+-		return GL_TRUE;
+-	}
+-}
++	struct gl_texture_unit *texUnit;
++	struct gl_texture_object *texObj;
++	struct gl_texture_image *texImage;
++	struct radeon_renderbuffer *rb;
++	radeon_texture_image *rImage;
++	radeonContextPtr radeon;
++	r300ContextPtr rmesa;
++	GLframebuffer *fb;
++	radeonTexObjPtr t;
++	uint32_t pitch_val;
+ 
+-void r300UpdateTextureState(GLcontext * ctx)
+-{
+-	int i;
++	target = GL_TEXTURE_RECTANGLE_ARB;
+ 
+-	for (i = 0; i < 8; i++) {
+-		if (!r300UpdateTextureUnit(ctx, i)) {
+-			_mesa_warning(ctx,
+-				      "failed to update texture state for unit %d.\n",
+-				      i);
+-		}
++	radeon = pDRICtx->driverPrivate;
++	rmesa = pDRICtx->driverPrivate;
++
++	fb = dPriv->driverPrivate;
++        texUnit = &radeon->glCtx->Texture.Unit[radeon->glCtx->Texture.CurrentUnit];
++	texObj = _mesa_select_tex_object(radeon->glCtx, texUnit, target);
++        texImage = _mesa_get_tex_image(radeon->glCtx, texObj, target, 0);
++
++	rImage = get_radeon_texture_image(texImage);
++	t = radeon_tex_obj(texObj);
++        if (t == NULL) {
++    	    return;
++    	}
++
++	radeon_update_renderbuffers(pDRICtx, dPriv);
++	/* back & depth buffers are useless here; free them right away */
++	rb = (void*)fb->Attachment[BUFFER_DEPTH].Renderbuffer;
++	if (rb && rb->bo) {
++		radeon_bo_unref(rb->bo);
++        rb->bo = NULL;
++	}
++	rb = (void*)fb->Attachment[BUFFER_BACK_LEFT].Renderbuffer;
++	if (rb && rb->bo) {
++		radeon_bo_unref(rb->bo);
++		rb->bo = NULL;
++	}
++	rb = (void*)fb->Attachment[BUFFER_FRONT_LEFT].Renderbuffer;
++	if (rb->bo == NULL) {
++		/* Failed to get a BO for the front-left buffer */
++		return;
++	}
++	
++	_mesa_lock_texture(radeon->glCtx, texObj);
++	if (t->bo) {
++		radeon_bo_unref(t->bo);
++		t->bo = NULL;
++	}
++	if (rImage->bo) {
++		radeon_bo_unref(rImage->bo);
++		rImage->bo = NULL;
++	}
++	if (t->mt) {
++		radeon_miptree_unreference(t->mt);
++		t->mt = NULL;
++	}
++	if (rImage->mt) {
++		radeon_miptree_unreference(rImage->mt);
++		rImage->mt = NULL;
++	}
++	fprintf(stderr,"settexbuf %dx%d@%d\n", rb->width, rb->height, rb->cpp);
++	_mesa_init_teximage_fields(radeon->glCtx, target, texImage,
++				   rb->width, rb->height, 1, 0, rb->cpp);
++	texImage->TexFormat = &_mesa_texformat_rgba8888_rev;
++	rImage->bo = rb->bo;
++	radeon_bo_ref(rImage->bo);
++	t->bo = rb->bo;
++	radeon_bo_ref(t->bo);
++	t->tile_bits = 0;
++	t->image_override = GL_TRUE;
++	t->override_offset = 0;
++	t->pp_txpitch &= (1 << 13) -1;
++	pitch_val = rb->pitch;
++	switch (rb->cpp) {
++	case 4:
++		t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, W, W8Z8Y8X8);
++		t->pp_txfilter |= tx_table[2].filter;
++		pitch_val /= 4;
++		break;
++	case 3:
++	default:
++		t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, ONE, W8Z8Y8X8);
++		t->pp_txfilter |= tx_table[4].filter;
++		pitch_val /= 4;
++		break;
++	case 2:
++		t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z5Y6X5);
++		t->pp_txfilter |= tx_table[5].filter;
++		pitch_val /= 2;
++		break;
++	}
++	pitch_val--;
++	t->pp_txsize = ((rb->width - 1) << R300_TX_WIDTHMASK_SHIFT) |
++              ((rb->height - 1) << R300_TX_HEIGHTMASK_SHIFT);
++	t->pp_txsize |= R300_TX_SIZE_TXPITCH_EN;
++	t->pp_txpitch |= pitch_val;
++
++	if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
++	    if (rb->width > 2048)
++		t->pp_txpitch |= R500_TXWIDTH_BIT11;
++	    if (rb->height > 2048)
++		t->pp_txpitch |= R500_TXHEIGHT_BIT11;
+ 	}
++	t->validated = GL_TRUE;
++	_mesa_unlock_texture(radeon->glCtx, texObj);
++	return;
+ }
+diff --git a/src/mesa/drivers/dri/r300/r500_fragprog.c b/src/mesa/drivers/dri/r300/r500_fragprog.c
+index 75dae86..926ddd5 100644
+--- a/src/mesa/drivers/dri/r300/r500_fragprog.c
++++ b/src/mesa/drivers/dri/r300/r500_fragprog.c
+@@ -31,6 +31,12 @@
+ #include "radeon_program_alu.h"
+ 
+ 
++static void reset_srcreg(struct prog_src_register* reg)
++{
++	_mesa_bzero(reg, sizeof(*reg));
++	reg->Swizzle = SWIZZLE_NOOP;
++}
++
+ static struct prog_src_register shadow_ambient(struct gl_program *program, int tmu)
+ {
+ 	gl_state_index fail_value_tokens[STATE_LENGTH] = {
+@@ -99,6 +105,19 @@ static GLboolean transform_TEX(
+ 		destredirect = GL_TRUE;
+ 	}
+ 
++	if (inst.SrcReg[0].File != PROGRAM_TEMPORARY && inst.SrcReg[0].File != PROGRAM_INPUT) {
++		int tmpreg = radeonFindFreeTemporary(t);
++		tgt = radeonAppendInstructions(t->Program, 1);
++		tgt->Opcode = OPCODE_MOV;
++		tgt->DstReg.File = PROGRAM_TEMPORARY;
++		tgt->DstReg.Index = tmpreg;
++		tgt->SrcReg[0] = inst.SrcReg[0];
++
++		reset_srcreg(&inst.SrcReg[0]);
++		inst.SrcReg[0].File = PROGRAM_TEMPORARY;
++		inst.SrcReg[0].Index = tmpreg;
++	}
++
+ 	tgt = radeonAppendInstructions(t->Program, 1);
+ 	_mesa_copy_instructions(tgt, &inst, 1);
+ 
+diff --git a/src/mesa/drivers/dri/r300/radeon_context.c b/src/mesa/drivers/dri/r300/radeon_context.c
+deleted file mode 100644
+index 5267fe9..0000000
+--- a/src/mesa/drivers/dri/r300/radeon_context.c
++++ /dev/null
+@@ -1,330 +0,0 @@
+-/*
+-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+-
+-The Weather Channel (TM) funded Tungsten Graphics to develop the
+-initial release of the Radeon 8500 driver under the XFree86 license.
+-This notice must be preserved.
+-
+-Permission is hereby granted, free of charge, to any person obtaining
+-a copy of this software and associated documentation files (the
+-"Software"), to deal in the Software without restriction, including
+-without limitation the rights to use, copy, modify, merge, publish,
+-distribute, sublicense, and/or sell copies of the Software, and to
+-permit persons to whom the Software is furnished to do so, subject to
+-the following conditions:
+-
+-The above copyright notice and this permission notice (including the
+-next paragraph) shall be included in all copies or substantial
+-portions of the Software.
+-
+-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+-
+-**************************************************************************/
+-
+-/**
+- * \file radeon_context.c
+- * Common context initialization.
+- *
+- * \author Keith Whitwell <keith@tungstengraphics.com>
+- */
+-
+-#include <dlfcn.h>
+-
+-#include "main/glheader.h"
+-#include "main/imports.h"
+-#include "main/context.h"
+-#include "main/state.h"
+-#include "main/matrix.h"
+-#include "main/framebuffer.h"
+-
+-#include "drivers/common/driverfuncs.h"
+-#include "swrast/swrast.h"
+-
+-#include "radeon_screen.h"
+-#include "radeon_ioctl.h"
+-#include "radeon_macros.h"
+-#include "radeon_reg.h"
+-
+-#include "radeon_state.h"
+-#include "r300_state.h"
+-
+-#include "utils.h"
+-#include "vblank.h"
+-#include "xmlpool.h"		/* for symbolic values of enum-type options */
+-
+-#define DRIVER_DATE "20060815"
+-
+-
+-/* Return various strings for glGetString().
+- */
+-static const GLubyte *radeonGetString(GLcontext * ctx, GLenum name)
+-{
+-	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+-	static char buffer[128];
+-
+-	switch (name) {
+-	case GL_VENDOR:
+-		if (IS_R300_CLASS(radeon->radeonScreen))
+-			return (GLubyte *) "DRI R300 Project";
+-		else
+-			return (GLubyte *) "Tungsten Graphics, Inc.";
+-
+-	case GL_RENDERER:
+-	{
+-		unsigned offset;
+-		GLuint agp_mode = (radeon->radeonScreen->card_type==RADEON_CARD_PCI) ? 0 :
+-			radeon->radeonScreen->AGPMode;
+-		const char* chipname;
+-
+-		if (IS_R300_CLASS(radeon->radeonScreen))
+-			chipname = "R300";
+-		else
+-			chipname = "R200";
+-
+-		offset = driGetRendererString(buffer, chipname, DRIVER_DATE,
+-					      agp_mode);
+-
+-		if (IS_R300_CLASS(radeon->radeonScreen)) {
+-		sprintf(&buffer[offset], " %sTCL",
+-			(radeon->radeonScreen->chip_flags & RADEON_CHIPSET_TCL)
+-			? "" : "NO-");
+-		} else {
+-			sprintf(&buffer[offset], " %sTCL",
+-			!(radeon->TclFallback & RADEON_TCL_FALLBACK_TCL_DISABLE)
+-			? "" : "NO-");
+-		}
+-
+-		return (GLubyte *) buffer;
+-	}
+-
+-	default:
+-		return NULL;
+-	}
+-}
+-
+-/* Initialize the driver's misc functions.
+- */
+-static void radeonInitDriverFuncs(struct dd_function_table *functions)
+-{
+-	functions->GetString = radeonGetString;
+-}
+-
+-
+-/**
+- * Create and initialize all common fields of the context,
+- * including the Mesa context itself.
+- */
+-GLboolean radeonInitContext(radeonContextPtr radeon,
+-			    struct dd_function_table* functions,
+-			    const __GLcontextModes * glVisual,
+-			    __DRIcontextPrivate * driContextPriv,
+-			    void *sharedContextPrivate)
+-{
+-	__DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
+-	radeonScreenPtr screen = (radeonScreenPtr) (sPriv->private);
+-	GLcontext* ctx;
+-	GLcontext* shareCtx;
+-	int fthrottle_mode;
+-
+-	/* Fill in additional standard functions. */
+-	radeonInitDriverFuncs(functions);
+-
+-	radeon->radeonScreen = screen;
+-	/* Allocate and initialize the Mesa context */
+-	if (sharedContextPrivate)
+-		shareCtx = ((radeonContextPtr)sharedContextPrivate)->glCtx;
+-	else
+-		shareCtx = NULL;
+-	radeon->glCtx = _mesa_create_context(glVisual, shareCtx,
+-					    functions, (void *)radeon);
+-	if (!radeon->glCtx)
+-		return GL_FALSE;
+-
+-	ctx = radeon->glCtx;
+-	driContextPriv->driverPrivate = radeon;
+-
+-	/* DRI fields */
+-	radeon->dri.context = driContextPriv;
+-	radeon->dri.screen = sPriv;
+-	radeon->dri.drawable = NULL;
+-	radeon->dri.readable = NULL;
+-	radeon->dri.hwContext = driContextPriv->hHWContext;
+-	radeon->dri.hwLock = &sPriv->pSAREA->lock;
+-	radeon->dri.fd = sPriv->fd;
+-	radeon->dri.drmMinor = sPriv->drm_version.minor;
+-
+-	radeon->sarea = (drm_radeon_sarea_t *) ((GLubyte *) sPriv->pSAREA +
+-					       screen->sarea_priv_offset);
+-
+-	/* Setup IRQs */
+-	fthrottle_mode = driQueryOptioni(&radeon->optionCache, "fthrottle_mode");
+-	radeon->iw.irq_seq = -1;
+-	radeon->irqsEmitted = 0;
+-	radeon->do_irqs = (fthrottle_mode == DRI_CONF_FTHROTTLE_IRQS &&
+-			  radeon->radeonScreen->irq);
+-
+-	radeon->do_usleeps = (fthrottle_mode == DRI_CONF_FTHROTTLE_USLEEPS);
+-
+-	if (!radeon->do_irqs)
+-		fprintf(stderr,
+-			"IRQ's not enabled, falling back to %s: %d %d\n",
+-			radeon->do_usleeps ? "usleeps" : "busy waits",
+-			fthrottle_mode, radeon->radeonScreen->irq);
+-
+-	(*sPriv->systemTime->getUST) (&radeon->swap_ust);
+-
+-	return GL_TRUE;
+-}
+-
+-
+-/**
+- * Cleanup common context fields.
+- * Called by r200DestroyContext/r300DestroyContext
+- */
+-void radeonCleanupContext(radeonContextPtr radeon)
+-{
+-	/* _mesa_destroy_context() might result in calls to functions that
+-	 * depend on the DriverCtx, so don't set it to NULL before.
+-	 *
+-	 * radeon->glCtx->DriverCtx = NULL;
+-	 */
+-
+-	/* free the Mesa context */
+-	_mesa_destroy_context(radeon->glCtx);
+-
+-	if (radeon->state.scissor.pClipRects) {
+-		FREE(radeon->state.scissor.pClipRects);
+-		radeon->state.scissor.pClipRects = 0;
+-	}
+-}
+-
+-
+-/**
+- * Swap front and back buffer.
+- */
+-void radeonSwapBuffers(__DRIdrawablePrivate * dPriv)
+-{
+-	if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
+-		radeonContextPtr radeon;
+-		GLcontext *ctx;
+-
+-		radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
+-		ctx = radeon->glCtx;
+-
+-		if (ctx->Visual.doubleBufferMode) {
+-			_mesa_notifySwapBuffers(ctx);	/* flush pending rendering comands */
+-			if (radeon->doPageFlip) {
+-				radeonPageFlip(dPriv);
+-			} else {
+-			    radeonCopyBuffer(dPriv, NULL);
+-			}
+-		}
+-	} else {
+-		/* XXX this shouldn't be an error but we can't handle it for now */
+-		_mesa_problem(NULL, "%s: drawable has no context!",
+-			      __FUNCTION__);
+-	}
+-}
+-
+-void radeonCopySubBuffer(__DRIdrawablePrivate * dPriv,
+-			 int x, int y, int w, int h )
+-{
+-    if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
+-	radeonContextPtr radeon;
+-	GLcontext *ctx;
+-
+-	radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
+-	ctx = radeon->glCtx;
+-
+-	if (ctx->Visual.doubleBufferMode) {
+-	    drm_clip_rect_t rect;
+-	    rect.x1 = x + dPriv->x;
+-	    rect.y1 = (dPriv->h - y - h) + dPriv->y;
+-	    rect.x2 = rect.x1 + w;
+-	    rect.y2 = rect.y1 + h;
+-	    _mesa_notifySwapBuffers(ctx);	/* flush pending rendering comands */
+-	    radeonCopyBuffer(dPriv, &rect);
+-	}
+-    } else {
+-	/* XXX this shouldn't be an error but we can't handle it for now */
+-	_mesa_problem(NULL, "%s: drawable has no context!",
+-		      __FUNCTION__);
+-    }
+-}
+-
+-/* Force the context `c' to be the current context and associate with it
+- * buffer `b'.
+- */
+-GLboolean radeonMakeCurrent(__DRIcontextPrivate * driContextPriv,
+-			    __DRIdrawablePrivate * driDrawPriv,
+-			    __DRIdrawablePrivate * driReadPriv)
+-{
+-	if (driContextPriv) {
+-		radeonContextPtr radeon =
+-			(radeonContextPtr) driContextPriv->driverPrivate;
+-
+-		if (RADEON_DEBUG & DEBUG_DRI)
+-			fprintf(stderr, "%s ctx %p\n", __FUNCTION__,
+-				radeon->glCtx);
+-
+-		if (radeon->dri.drawable != driDrawPriv) {
+-			if (driDrawPriv->swap_interval == (unsigned)-1) {
+-				driDrawPriv->vblFlags =
+-					(radeon->radeonScreen->irq != 0)
+-					? driGetDefaultVBlankFlags(&radeon->
+-								   optionCache)
+-					: VBLANK_FLAG_NO_IRQ;
+-
+-				driDrawableInitVBlank(driDrawPriv);
+-			}
+-		}
+-
+-		radeon->dri.readable = driReadPriv;
+-
+-		if (radeon->dri.drawable != driDrawPriv ||
+-		    radeon->lastStamp != driDrawPriv->lastStamp) {
+-			radeon->dri.drawable = driDrawPriv;
+-
+-			radeonSetCliprects(radeon);
+-			r300UpdateViewportOffset(radeon->glCtx);
+-		}
+-
+-		_mesa_make_current(radeon->glCtx,
+-				    (GLframebuffer *) driDrawPriv->
+-				    driverPrivate,
+-				    (GLframebuffer *) driReadPriv->
+-				    driverPrivate);
+-
+-		_mesa_update_state(radeon->glCtx);		
+-
+-		radeonUpdatePageFlipping(radeon);
+-	} else {
+-		if (RADEON_DEBUG & DEBUG_DRI)
+-			fprintf(stderr, "%s ctx is null\n", __FUNCTION__);
+-		_mesa_make_current(0, 0, 0);
+-	}
+-
+-	if (RADEON_DEBUG & DEBUG_DRI)
+-		fprintf(stderr, "End %s\n", __FUNCTION__);
+-	return GL_TRUE;
+-}
+-
+-/* Force the context `c' to be unbound from its buffer.
+- */
+-GLboolean radeonUnbindContext(__DRIcontextPrivate * driContextPriv)
+-{
+-	radeonContextPtr radeon = (radeonContextPtr) driContextPriv->driverPrivate;
+-
+-	if (RADEON_DEBUG & DEBUG_DRI)
+-		fprintf(stderr, "%s ctx %p\n", __FUNCTION__,
+-			radeon->glCtx);
+-
+-	return GL_TRUE;
+-}
+-
+diff --git a/src/mesa/drivers/dri/r300/radeon_context.h b/src/mesa/drivers/dri/r300/radeon_context.h
+index 47cbc22..250570f 100644
+--- a/src/mesa/drivers/dri/r300/radeon_context.h
++++ b/src/mesa/drivers/dri/r300/radeon_context.h
+@@ -49,20 +49,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "drm.h"
+ #include "dri_util.h"
+ 
+-struct radeon_context;
+-typedef struct radeon_context radeonContextRec;
+-typedef struct radeon_context *radeonContextPtr;
+-
+-/* Rasterizing fallbacks */
+-/* See correponding strings in r200_swtcl.c */
+-#define RADEON_FALLBACK_TEXTURE		0x0001
+-#define RADEON_FALLBACK_DRAW_BUFFER	0x0002
+-#define RADEON_FALLBACK_STENCIL		0x0004
+-#define RADEON_FALLBACK_RENDER_MODE	0x0008
+-#define RADEON_FALLBACK_BLEND_EQ	0x0010
+-#define RADEON_FALLBACK_BLEND_FUNC	0x0020
+-#define RADEON_FALLBACK_DISABLE		0x0040
+-#define RADEON_FALLBACK_BORDER_MODE	0x0080
++#include "radeon_screen.h"
+ 
+ #if R200_MERGED
+ extern void radeonFallback(GLcontext * ctx, GLuint bit, GLboolean mode);
+@@ -79,155 +66,11 @@ extern void radeonFallback(GLcontext * ctx, GLuint bit, GLboolean mode);
+ /* TCL fallbacks */
+ extern void radeonTclFallback(GLcontext * ctx, GLuint bit, GLboolean mode);
+ 
+-#define RADEON_TCL_FALLBACK_RASTER		0x0001	/* rasterization */
+-#define RADEON_TCL_FALLBACK_UNFILLED		0x0002	/* unfilled tris */
+-#define RADEON_TCL_FALLBACK_LIGHT_TWOSIDE	0x0004	/* twoside tris */
+-#define RADEON_TCL_FALLBACK_MATERIAL		0x0008	/* material in vb */
+-#define RADEON_TCL_FALLBACK_TEXGEN_0		0x0010	/* texgen, unit 0 */
+-#define RADEON_TCL_FALLBACK_TEXGEN_1		0x0020	/* texgen, unit 1 */
+-#define RADEON_TCL_FALLBACK_TEXGEN_2		0x0040	/* texgen, unit 2 */
+-#define RADEON_TCL_FALLBACK_TEXGEN_3		0x0080	/* texgen, unit 3 */
+-#define RADEON_TCL_FALLBACK_TEXGEN_4		0x0100	/* texgen, unit 4 */
+-#define RADEON_TCL_FALLBACK_TEXGEN_5		0x0200	/* texgen, unit 5 */
+-#define RADEON_TCL_FALLBACK_TCL_DISABLE		0x0400	/* user disable */
+-#define RADEON_TCL_FALLBACK_BITMAP		0x0800	/* draw bitmap with points */
+-#define RADEON_TCL_FALLBACK_VERTEX_PROGRAM	0x1000	/* vertex program active */
+-
+ #if R200_MERGED
+ #define TCL_FALLBACK( ctx, bit, mode )	radeonTclFallback( ctx, bit, mode )
+ #else
+ #define TCL_FALLBACK( ctx, bit, mode )	;
+ #endif
+ 
+-struct radeon_dri_mirror {
+-	__DRIcontextPrivate *context;	/* DRI context */
+-	__DRIscreenPrivate *screen;	/* DRI screen */
+-	/**
+-	 * DRI drawable bound to this context for drawing.
+-	 */
+-	__DRIdrawablePrivate *drawable;
+-
+-	/**
+-	 * DRI drawable bound to this context for reading.
+-	 */
+-	__DRIdrawablePrivate *readable;
+-
+-	drm_context_t hwContext;
+-	drm_hw_lock_t *hwLock;
+-	int fd;
+-	int drmMinor;
+-};
+-
+-/**
+- * Derived state for internal purposes.
+- */
+-struct radeon_scissor_state {
+-	drm_clip_rect_t rect;
+-	GLboolean enabled;
+-
+-	GLuint numClipRects;	/* Cliprects active */
+-	GLuint numAllocedClipRects;	/* Cliprects available */
+-	drm_clip_rect_t *pClipRects;
+-};
+-
+-struct radeon_colorbuffer_state {
+-	GLuint clear;
+-	GLint drawOffset, drawPitch;
+-};
+-
+-struct radeon_state {
+-	struct radeon_colorbuffer_state color;
+-	struct radeon_scissor_state scissor;
+-};
+-
+-/**
+- * Common per-context variables shared by R200 and R300.
+- * R200- and R300-specific code "derive" their own context from this
+- * structure.
+- */
+-struct radeon_context {
+-	GLcontext *glCtx;	/* Mesa context */
+-	radeonScreenPtr radeonScreen;	/* Screen private DRI data */
+-
+-	/* Fallback state */
+-	GLuint Fallback;
+-	GLuint TclFallback;
+-
+-	/* Page flipping */
+-	GLuint doPageFlip;
+-
+-	/* Drawable, cliprect and scissor information */
+-	GLuint numClipRects;	/* Cliprects for the draw buffer */
+-	drm_clip_rect_t *pClipRects;
+-	unsigned int lastStamp;
+-	GLboolean lost_context;
+-	drm_radeon_sarea_t *sarea;	/* Private SAREA data */
+-
+-	/* Mirrors of some DRI state */
+-	struct radeon_dri_mirror dri;
+-
+-	/* Busy waiting */
+-	GLuint do_usleeps;
+-	GLuint do_irqs;
+-	GLuint irqsEmitted;
+-	drm_radeon_irq_wait_t iw;
+-
+-	/* buffer swap */
+-	int64_t swap_ust;
+-	int64_t swap_missed_ust;
+-
+-	GLuint swap_count;
+-	GLuint swap_missed_count;
+-
+-	/* Derived state */
+-	struct radeon_state state;
+-
+-	/* Configuration cache
+-	 */
+-	driOptionCache optionCache;
+-};
+-
+-#define RADEON_CONTEXT(glctx) ((radeonContextPtr)(ctx->DriverCtx))
+-
+-extern void radeonSwapBuffers(__DRIdrawablePrivate * dPriv);
+-extern void radeonCopySubBuffer(__DRIdrawablePrivate * dPriv,
+-				int x, int y, int w, int h);
+-extern GLboolean radeonInitContext(radeonContextPtr radeon,
+-				   struct dd_function_table *functions,
+-				   const __GLcontextModes * glVisual,
+-				   __DRIcontextPrivate * driContextPriv,
+-				   void *sharedContextPrivate);
+-extern void radeonCleanupContext(radeonContextPtr radeon);
+-extern GLboolean radeonMakeCurrent(__DRIcontextPrivate * driContextPriv,
+-				   __DRIdrawablePrivate * driDrawPriv,
+-				   __DRIdrawablePrivate * driReadPriv);
+-extern GLboolean radeonUnbindContext(__DRIcontextPrivate * driContextPriv);
+-
+-/* ================================================================
+- * Debugging:
+- */
+-#define DO_DEBUG		1
+-
+-#if DO_DEBUG
+-extern int RADEON_DEBUG;
+-#else
+-#define RADEON_DEBUG		0
+-#endif
+-
+-#define DEBUG_TEXTURE	0x0001
+-#define DEBUG_STATE	0x0002
+-#define DEBUG_IOCTL	0x0004
+-#define DEBUG_PRIMS	0x0008
+-#define DEBUG_VERTS	0x0010
+-#define DEBUG_FALLBACKS	0x0020
+-#define DEBUG_VFMT	0x0040
+-#define DEBUG_CODEGEN	0x0080
+-#define DEBUG_VERBOSE	0x0100
+-#define DEBUG_DRI       0x0200
+-#define DEBUG_DMA       0x0400
+-#define DEBUG_SANITY    0x0800
+-#define DEBUG_SYNC      0x1000
+-#define DEBUG_PIXEL     0x2000
+-#define DEBUG_MEMORY    0x4000
+ 
+ #endif				/* __RADEON_CONTEXT_H__ */
+diff --git a/src/mesa/drivers/dri/r300/radeon_ioctl.c b/src/mesa/drivers/dri/r300/radeon_ioctl.c
+deleted file mode 100644
+index 36502eb..0000000
+--- a/src/mesa/drivers/dri/r300/radeon_ioctl.c
++++ /dev/null
+@@ -1,396 +0,0 @@
+-/*
+-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+-
+-The Weather Channel (TM) funded Tungsten Graphics to develop the
+-initial release of the Radeon 8500 driver under the XFree86 license.
+-This notice must be preserved.
+-
+-Permission is hereby granted, free of charge, to any person obtaining
+-a copy of this software and associated documentation files (the
+-"Software"), to deal in the Software without restriction, including
+-without limitation the rights to use, copy, modify, merge, publish,
+-distribute, sublicense, and/or sell copies of the Software, and to
+-permit persons to whom the Software is furnished to do so, subject to
+-the following conditions:
+-
+-The above copyright notice and this permission notice (including the
+-next paragraph) shall be included in all copies or substantial
+-portions of the Software.
+-
+-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+-
+-**************************************************************************/
+-
+-/*
+- * Authors:
+- *   Keith Whitwell <keith@tungstengraphics.com>
+- */
+-
+-#include <sched.h>
+-#include <errno.h>
+-
+-#include "main/glheader.h"
+-#include "main/imports.h"
+-#include "main/macros.h"
+-#include "main/context.h"
+-#include "swrast/swrast.h"
+-#include "r300_context.h"
+-#include "radeon_ioctl.h"
+-#include "r300_ioctl.h"
+-#include "r300_state.h"
+-#include "radeon_reg.h"
+-
+-#include "drirenderbuffer.h"
+-#include "vblank.h"
+-
+-static void radeonWaitForIdle(radeonContextPtr radeon);
+-
+-/* ================================================================
+- * SwapBuffers with client-side throttling
+- */
+-
+-static uint32_t radeonGetLastFrame(radeonContextPtr radeon)
+-{
+-	drm_radeon_getparam_t gp;
+-	int ret;
+-	uint32_t frame;
+-
+-	gp.param = RADEON_PARAM_LAST_FRAME;
+-	gp.value = (int *)&frame;
+-	ret = drmCommandWriteRead(radeon->dri.fd, DRM_RADEON_GETPARAM,
+-				  &gp, sizeof(gp));
+-	if (ret) {
+-		fprintf(stderr, "%s: drmRadeonGetParam: %d\n", __FUNCTION__,
+-			ret);
+-		exit(1);
+-	}
+-
+-	return frame;
+-}
+-
+-uint32_t radeonGetAge(radeonContextPtr radeon)
+-{
+-	drm_radeon_getparam_t gp;
+-	int ret;
+-	uint32_t age;
+-
+-	gp.param = RADEON_PARAM_LAST_CLEAR;
+-	gp.value = (int *)&age;
+-	ret = drmCommandWriteRead(radeon->dri.fd, DRM_RADEON_GETPARAM,
+-				  &gp, sizeof(gp));
+-	if (ret) {
+-		fprintf(stderr, "%s: drmRadeonGetParam: %d\n", __FUNCTION__,
+-			ret);
+-		exit(1);
+-	}
+-
+-	return age;
+-}
+-
+-static void radeonEmitIrqLocked(radeonContextPtr radeon)
+-{
+-	drm_radeon_irq_emit_t ie;
+-	int ret;
+-
+-	ie.irq_seq = &radeon->iw.irq_seq;
+-	ret = drmCommandWriteRead(radeon->dri.fd, DRM_RADEON_IRQ_EMIT,
+-				  &ie, sizeof(ie));
+-	if (ret) {
+-		fprintf(stderr, "%s: drmRadeonIrqEmit: %d\n", __FUNCTION__,
+-			ret);
+-		exit(1);
+-	}
+-}
+-
+-static void radeonWaitIrq(radeonContextPtr radeon)
+-{
+-	int ret;
+-
+-	do {
+-		ret = drmCommandWrite(radeon->dri.fd, DRM_RADEON_IRQ_WAIT,
+-				      &radeon->iw, sizeof(radeon->iw));
+-	} while (ret && (errno == EINTR || errno == EBUSY));
+-
+-	if (ret) {
+-		fprintf(stderr, "%s: drmRadeonIrqWait: %d\n", __FUNCTION__,
+-			ret);
+-		exit(1);
+-	}
+-}
+-
+-static void radeonWaitForFrameCompletion(radeonContextPtr radeon)
+-{
+-	drm_radeon_sarea_t *sarea = radeon->sarea;
+-
+-	if (radeon->do_irqs) {
+-		if (radeonGetLastFrame(radeon) < sarea->last_frame) {
+-			if (!radeon->irqsEmitted) {
+-				while (radeonGetLastFrame(radeon) <
+-				       sarea->last_frame) ;
+-			} else {
+-				UNLOCK_HARDWARE(radeon);
+-				radeonWaitIrq(radeon);
+-				LOCK_HARDWARE(radeon);
+-			}
+-			radeon->irqsEmitted = 10;
+-		}
+-
+-		if (radeon->irqsEmitted) {
+-			radeonEmitIrqLocked(radeon);
+-			radeon->irqsEmitted--;
+-		}
+-	} else {
+-		while (radeonGetLastFrame(radeon) < sarea->last_frame) {
+-			UNLOCK_HARDWARE(radeon);
+-			if (radeon->do_usleeps)
+-				DO_USLEEP(1);
+-			LOCK_HARDWARE(radeon);
+-		}
+-	}
+-}
+-
+-/* Copy the back color buffer to the front color buffer.
+- */
+-void radeonCopyBuffer(__DRIdrawablePrivate * dPriv,
+-		      const drm_clip_rect_t	 * rect)
+-{
+-	radeonContextPtr radeon;
+-	GLint nbox, i, ret;
+-	GLboolean missed_target;
+-	int64_t ust;
+-	__DRIscreenPrivate *psp = dPriv->driScreenPriv;
+-
+-	assert(dPriv);
+-	assert(dPriv->driContextPriv);
+-	assert(dPriv->driContextPriv->driverPrivate);
+-
+-	radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
+-
+-	if (RADEON_DEBUG & DEBUG_IOCTL) {
+-		fprintf(stderr, "\n%s( %p )\n\n", __FUNCTION__,
+-			(void *)radeon->glCtx);
+-	}
+-
+-	r300Flush(radeon->glCtx);
+-
+-	LOCK_HARDWARE(radeon);
+-
+-	/* Throttle the frame rate -- only allow one pending swap buffers
+-	 * request at a time.
+-	 */
+-	radeonWaitForFrameCompletion(radeon);
+-	if (!rect)
+-	{
+-	    UNLOCK_HARDWARE(radeon);
+-	    driWaitForVBlank(dPriv, &missed_target);
+-	    LOCK_HARDWARE(radeon);
+-	}
+-
+-	nbox = dPriv->numClipRects;	/* must be in locked region */
+-
+-	for (i = 0; i < nbox;) {
+-		GLint nr = MIN2(i + RADEON_NR_SAREA_CLIPRECTS, nbox);
+-		drm_clip_rect_t *box = dPriv->pClipRects;
+-		drm_clip_rect_t *b = radeon->sarea->boxes;
+-		GLint n = 0;
+-
+-		for ( ; i < nr ; i++ ) {
+-
+-		    *b = box[i];
+-
+-		    if (rect)
+-		    {
+-			if (rect->x1 > b->x1)
+-			    b->x1 = rect->x1;
+-			if (rect->y1 > b->y1)
+-			    b->y1 = rect->y1;
+-			if (rect->x2 < b->x2)
+-			    b->x2 = rect->x2;
+-			if (rect->y2 < b->y2)
+-			    b->y2 = rect->y2;
+-
+-			if (b->x1 >= b->x2 || b->y1 >= b->y2)
+-			    continue;
+-		    }
+-
+-		    b++;
+-		    n++;
+-		}
+-		radeon->sarea->nbox = n;
+-
+-		if (!n)
+-		   continue;
+-
+-		ret = drmCommandNone(radeon->dri.fd, DRM_RADEON_SWAP);
+-
+-		if (ret) {
+-			fprintf(stderr, "DRM_RADEON_SWAP: return = %d\n",
+-				ret);
+-			UNLOCK_HARDWARE(radeon);
+-			exit(1);
+-		}
+-	}
+-
+-	UNLOCK_HARDWARE(radeon);
+-	if (!rect)
+-	{
+-	    ((r300ContextPtr)radeon)->hw.all_dirty = GL_TRUE;
+-
+-	    radeon->swap_count++;
+-	    (*psp->systemTime->getUST) (&ust);
+-	    if (missed_target) {
+-		radeon->swap_missed_count++;
+-		radeon->swap_missed_ust = ust - radeon->swap_ust;
+-	    }
+-
+-	    radeon->swap_ust = ust;
+-
+-	    sched_yield();
+-	}
+-}
+-
+-void radeonPageFlip(__DRIdrawablePrivate * dPriv)
+-{
+-	radeonContextPtr radeon;
+-	GLint ret;
+-	GLboolean missed_target;
+-	__DRIscreenPrivate *psp = dPriv->driScreenPriv;
+-
+-	assert(dPriv);
+-	assert(dPriv->driContextPriv);
+-	assert(dPriv->driContextPriv->driverPrivate);
+-
+-	radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
+-
+-	if (RADEON_DEBUG & DEBUG_IOCTL) {
+-		fprintf(stderr, "%s: pfCurrentPage: %d\n", __FUNCTION__,
+-			radeon->sarea->pfCurrentPage);
+-	}
+-
+-	r300Flush(radeon->glCtx);
+-	LOCK_HARDWARE(radeon);
+-
+-	if (!dPriv->numClipRects) {
+-		UNLOCK_HARDWARE(radeon);
+-		usleep(10000);	/* throttle invisible client 10ms */
+-		return;
+-	}
+-
+-	/* Need to do this for the perf box placement:
+-	 */
+-	{
+-		drm_clip_rect_t *box = dPriv->pClipRects;
+-		drm_clip_rect_t *b = radeon->sarea->boxes;
+-		b[0] = box[0];
+-		radeon->sarea->nbox = 1;
+-	}
+-
+-	/* Throttle the frame rate -- only allow a few pending swap buffers
+-	 * request at a time.
+-	 */
+-	radeonWaitForFrameCompletion(radeon);
+-	UNLOCK_HARDWARE(radeon);
+-	driWaitForVBlank(dPriv, &missed_target);
+-	if (missed_target) {
+-		radeon->swap_missed_count++;
+-		(void)(*psp->systemTime->getUST) (&radeon->swap_missed_ust);
+-	}
+-	LOCK_HARDWARE(radeon);
+-
+-	ret = drmCommandNone(radeon->dri.fd, DRM_RADEON_FLIP);
+-
+-	UNLOCK_HARDWARE(radeon);
+-
+-	if (ret) {
+-		fprintf(stderr, "DRM_RADEON_FLIP: return = %d\n", ret);
+-		exit(1);
+-	}
+-
+-	radeon->swap_count++;
+-	(void)(*psp->systemTime->getUST) (&radeon->swap_ust);
+-
+-        driFlipRenderbuffers(radeon->glCtx->WinSysDrawBuffer, 
+-                             radeon->sarea->pfCurrentPage);
+-
+-	if (radeon->sarea->pfCurrentPage == 1) {
+-		radeon->state.color.drawOffset = radeon->radeonScreen->frontOffset;
+-		radeon->state.color.drawPitch = radeon->radeonScreen->frontPitch;
+-	} else {
+-		radeon->state.color.drawOffset = radeon->radeonScreen->backOffset;
+-		radeon->state.color.drawPitch = radeon->radeonScreen->backPitch;
+-	}
+-
+-	if (IS_R300_CLASS(radeon->radeonScreen)) {
+-		r300ContextPtr r300 = (r300ContextPtr)radeon;
+-		R300_STATECHANGE(r300, cb);
+-		r300->hw.cb.cmd[R300_CB_OFFSET] = r300->radeon.state.color.drawOffset + 
+-						r300->radeon.radeonScreen->fbLocation;
+-		r300->hw.cb.cmd[R300_CB_PITCH] = r300->radeon.state.color.drawPitch;
+-		
+-		if (r300->radeon.radeonScreen->cpp == 4)
+-			r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_ARGB8888;
+-		else
+-			r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_RGB565;
+-	
+-		if (r300->radeon.sarea->tiling_enabled)
+-			r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_TILE_ENABLE;
+-	}
+-}
+-
+-void radeonWaitForIdleLocked(radeonContextPtr radeon)
+-{
+-	int ret;
+-	int i = 0;
+-
+-	do {
+-		ret = drmCommandNone(radeon->dri.fd, DRM_RADEON_CP_IDLE);
+-		if (ret)
+-			DO_USLEEP(1);
+-	} while (ret && ++i < 100);
+-
+-	if (ret < 0) {
+-		UNLOCK_HARDWARE(radeon);
+-		fprintf(stderr, "Error: R300 timed out... exiting\n");
+-		exit(-1);
+-	}
+-}
+-
+-static void radeonWaitForIdle(radeonContextPtr radeon)
+-{
+-	LOCK_HARDWARE(radeon);
+-	radeonWaitForIdleLocked(radeon);
+-	UNLOCK_HARDWARE(radeon);
+-}
+-
+-void radeonFlush(GLcontext * ctx)
+-{
+-	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+-
+-	if (IS_R300_CLASS(radeon->radeonScreen))
+-		r300Flush(ctx);
+-}
+-
+-
+-/* Make sure all commands have been sent to the hardware and have
+- * completed processing.
+- */
+-void radeonFinish(GLcontext * ctx)
+-{
+-	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+-
+-	radeonFlush(ctx);
+-
+-	if (radeon->do_irqs) {
+-		LOCK_HARDWARE(radeon);
+-		radeonEmitIrqLocked(radeon);
+-		UNLOCK_HARDWARE(radeon);
+-		radeonWaitIrq(radeon);
+-	} else
+-		radeonWaitForIdle(radeon);
+-}
+diff --git a/src/mesa/drivers/dri/r300/radeon_ioctl.h b/src/mesa/drivers/dri/r300/radeon_ioctl.h
+deleted file mode 100644
+index 3add775..0000000
+--- a/src/mesa/drivers/dri/r300/radeon_ioctl.h
++++ /dev/null
+@@ -1,57 +0,0 @@
+-/*
+-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+-
+-The Weather Channel (TM) funded Tungsten Graphics to develop the
+-initial release of the Radeon 8500 driver under the XFree86 license.
+-This notice must be preserved.
+-
+-Permission is hereby granted, free of charge, to any person obtaining
+-a copy of this software and associated documentation files (the
+-"Software"), to deal in the Software without restriction, including
+-without limitation the rights to use, copy, modify, merge, publish,
+-distribute, sublicense, and/or sell copies of the Software, and to
+-permit persons to whom the Software is furnished to do so, subject to
+-the following conditions:
+-
+-The above copyright notice and this permission notice (including the
+-next paragraph) shall be included in all copies or substantial
+-portions of the Software.
+-
+-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+-
+-**************************************************************************/
+-
+-/*
+- * Authors:
+- *   Keith Whitwell <keith@tungstengraphics.com>
+- */
+-
+-#ifndef __RADEON_IOCTL_H__
+-#define __RADEON_IOCTL_H__
+-
+-#include "main/simple_list.h"
+-#include "radeon_dri.h"
+-#include "radeon_lock.h"
+-
+-#include "xf86drm.h"
+-#include "drm.h"
+-#if 0
+-#include "r200context.h"
+-#endif
+-#include "radeon_drm.h"
+-
+-extern void radeonCopyBuffer(__DRIdrawablePrivate * drawable,
+-			     const drm_clip_rect_t	* rect);
+-extern void radeonPageFlip(__DRIdrawablePrivate * drawable);
+-extern void radeonFlush(GLcontext * ctx);
+-extern void radeonFinish(GLcontext * ctx);
+-extern void radeonWaitForIdleLocked(radeonContextPtr radeon);
+-extern uint32_t radeonGetAge(radeonContextPtr radeon);
+-
+-#endif				/* __RADEON_IOCTL_H__ */
+diff --git a/src/mesa/drivers/dri/r300/radeon_lock.c b/src/mesa/drivers/dri/r300/radeon_lock.c
+deleted file mode 100644
+index 4f47afd..af4108a
+--- a/src/mesa/drivers/dri/r300/radeon_lock.c
++++ /dev/null
+@@ -1,137 +0,0 @@
+-/**************************************************************************
+-
+-Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+-                     VA Linux Systems Inc., Fremont, California.
+-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+-
+-The Weather Channel (TM) funded Tungsten Graphics to develop the
+-initial release of the Radeon 8500 driver under the XFree86 license.
+-This notice must be preserved.
+-
+-All Rights Reserved.
+-
+-Permission is hereby granted, free of charge, to any person obtaining
+-a copy of this software and associated documentation files (the
+-"Software"), to deal in the Software without restriction, including
+-without limitation the rights to use, copy, modify, merge, publish,
+-distribute, sublicense, and/or sell copies of the Software, and to
+-permit persons to whom the Software is furnished to do so, subject to
+-the following conditions:
+-
+-The above copyright notice and this permission notice (including the
+-next paragraph) shall be included in all copies or substantial
+-portions of the Software.
+-
+-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+-
+-**************************************************************************/
+-
+-/*
+- * Authors:
+- *   Gareth Hughes <gareth@valinux.com>
+- *   Keith Whitwell <keith@tungstengraphics.com>
+- *   Kevin E. Martin <martin@valinux.com>
+- */
+-
+-#include "radeon_lock.h"
+-#include "radeon_ioctl.h"
+-#include "radeon_state.h"
+-#include "r300_context.h"
+-#include "r300_state.h"
+-
+-#include "main/framebuffer.h"
+-
+-#include "drirenderbuffer.h"
+-
+-#if DEBUG_LOCKING
+-char *prevLockFile = NULL;
+-int prevLockLine = 0;
+-#endif
+-
+-/* Turn on/off page flipping according to the flags in the sarea:
+- */
+-void radeonUpdatePageFlipping(radeonContextPtr rmesa)
+-{
+-	int use_back;
+-
+-	rmesa->doPageFlip = rmesa->sarea->pfState;
+-	if (rmesa->glCtx->WinSysDrawBuffer) {
+-		driFlipRenderbuffers(rmesa->glCtx->WinSysDrawBuffer,
+-				     rmesa->sarea->pfCurrentPage);
+-		r300UpdateDrawBuffer(rmesa->glCtx);
+-	}
+-
+-	use_back = rmesa->glCtx->DrawBuffer ?
+-	    (rmesa->glCtx->DrawBuffer->_ColorDrawBufferIndexes[0] ==
+-	     BUFFER_BACK_LEFT) : 1;
+-	use_back ^= (rmesa->sarea->pfCurrentPage == 1);
+-
+-	if (use_back) {
+-		rmesa->state.color.drawOffset =
+-		    rmesa->radeonScreen->backOffset;
+-		rmesa->state.color.drawPitch = rmesa->radeonScreen->backPitch;
+-	} else {
+-		rmesa->state.color.drawOffset =
+-		    rmesa->radeonScreen->frontOffset;
+-		rmesa->state.color.drawPitch =
+-		    rmesa->radeonScreen->frontPitch;
+-	}
+-}
+-
+-/* Update the hardware state.  This is called if another context has
+- * grabbed the hardware lock, which includes the X server.  This
+- * function also updates the driver's window state after the X server
+- * moves, resizes or restacks a window -- the change will be reflected
+- * in the drawable position and clip rects.  Since the X server grabs
+- * the hardware lock when it changes the window state, this routine will
+- * automatically be called after such a change.
+- */
+-void radeonGetLock(radeonContextPtr rmesa, GLuint flags)
+-{
+-	__DRIdrawablePrivate *const drawable = rmesa->dri.drawable;
+-	__DRIdrawablePrivate *const readable = rmesa->dri.readable;
+-	__DRIscreenPrivate *sPriv = rmesa->dri.screen;
+-	drm_radeon_sarea_t *sarea = rmesa->sarea;
+-	r300ContextPtr r300 = (r300ContextPtr) rmesa;
+-
+-	assert(drawable != NULL);
+-
+-	drmGetLock(rmesa->dri.fd, rmesa->dri.hwContext, flags);
+-
+-	/* The window might have moved, so we might need to get new clip
+-	 * rects.
+-	 *
+-	 * NOTE: This releases and regrabs the hw lock to allow the X server
+-	 * to respond to the DRI protocol request for new drawable info.
+-	 * Since the hardware state depends on having the latest drawable
+-	 * clip rects, all state checking must be done _after_ this call.
+-	 */
+-	DRI_VALIDATE_DRAWABLE_INFO(sPriv, drawable);
+-	if (drawable != readable) {
+-		DRI_VALIDATE_DRAWABLE_INFO(sPriv, readable);
+-	}
+-
+-	if (rmesa->lastStamp != drawable->lastStamp) {
+-		radeonUpdatePageFlipping(rmesa);
+-		radeonSetCliprects(rmesa);
+-		r300UpdateViewportOffset(rmesa->glCtx);
+-		driUpdateFramebufferSize(rmesa->glCtx, drawable);
+-	}
+-
+-	if (sarea->ctx_owner != rmesa->dri.hwContext) {
+-		int i;
+-
+-		sarea->ctx_owner = rmesa->dri.hwContext;
+-		for (i = 0; i < r300->nr_heaps; i++) {
+-			DRI_AGE_TEXTURES(r300->texture_heaps[i]);
+-		}
+-	}
+-
+-	rmesa->lost_context = GL_TRUE;
+-}
+diff --git a/src/mesa/drivers/dri/r300/radeon_lock.c b/src/mesa/drivers/dri/r300/radeon_lock.c
+new file mode 120000
+index 4f47afd..af4108a
+--- /dev/null
++++ b/src/mesa/drivers/dri/r300/radeon_lock.c
+@@ -0,0 +1 @@
++../radeon/radeon_lock.c
+\ No newline at end of file
+diff --git a/src/mesa/drivers/dri/r300/radeon_lock.h b/src/mesa/drivers/dri/r300/radeon_lock.h
+deleted file mode 100644
+index a344837..64bdf94
+--- a/src/mesa/drivers/dri/r300/radeon_lock.h
++++ /dev/null
+@@ -1,115 +0,0 @@
+-/**************************************************************************
+-
+-Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+-                     VA Linux Systems Inc., Fremont, California.
+-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+-
+-The Weather Channel (TM) funded Tungsten Graphics to develop the
+-initial release of the Radeon 8500 driver under the XFree86 license.
+-This notice must be preserved.
+-
+-All Rights Reserved.
+-
+-Permission is hereby granted, free of charge, to any person obtaining
+-a copy of this software and associated documentation files (the
+-"Software"), to deal in the Software without restriction, including
+-without limitation the rights to use, copy, modify, merge, publish,
+-distribute, sublicense, and/or sell copies of the Software, and to
+-permit persons to whom the Software is furnished to do so, subject to
+-the following conditions:
+-
+-The above copyright notice and this permission notice (including the
+-next paragraph) shall be included in all copies or substantial
+-portions of the Software.
+-
+-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+-
+-**************************************************************************/
+-
+-/*
+- * Authors:
+- *   Gareth Hughes <gareth@valinux.com>
+- *   Keith Whitwell <keith@tungstengraphics.com>
+- *   Kevin E. Martin <martin@valinux.com>
+- */
+-
+-#ifndef __RADEON_LOCK_H__
+-#define __RADEON_LOCK_H__
+-
+-#include "radeon_context.h"
+-
+-extern void radeonGetLock(radeonContextPtr rmesa, GLuint flags);
+-extern void radeonUpdatePageFlipping(radeonContextPtr rmesa);
+-
+-/* Turn DEBUG_LOCKING on to find locking conflicts.
+- */
+-#define DEBUG_LOCKING	0
+-
+-#if DEBUG_LOCKING
+-extern char *prevLockFile;
+-extern int prevLockLine;
+-
+-#define DEBUG_LOCK()							\
+-   do {									\
+-      prevLockFile = (__FILE__);					\
+-      prevLockLine = (__LINE__);					\
+-   } while (0)
+-
+-#define DEBUG_RESET()							\
+-   do {									\
+-      prevLockFile = 0;							\
+-      prevLockLine = 0;							\
+-   } while (0)
+-
+-#define DEBUG_CHECK_LOCK()						\
+-   do {									\
+-      if (prevLockFile) {						\
+-	 fprintf(stderr,						\
+-		  "LOCK SET!\n\tPrevious %s:%d\n\tCurrent: %s:%d\n",	\
+-		  prevLockFile, prevLockLine, __FILE__, __LINE__);	\
+-	 exit(1);							\
+-      }									\
+-   } while (0)
+-
+-#else
+-
+-#define DEBUG_LOCK()
+-#define DEBUG_RESET()
+-#define DEBUG_CHECK_LOCK()
+-
+-#endif
+-
+-/*
+- * !!! We may want to separate locks from locks with validation.  This
+- * could be used to improve performance for those things commands that
+- * do not do any drawing !!!
+- */
+-
+-/* Lock the hardware and validate our state.
+- */
+-#define LOCK_HARDWARE( rmesa )						\
+-	do {								\
+-		char __ret = 0;						\
+-		DEBUG_CHECK_LOCK();					\
+-		DRM_CAS((rmesa)->dri.hwLock, (rmesa)->dri.hwContext,	\
+-			(DRM_LOCK_HELD | (rmesa)->dri.hwContext), __ret); \
+-		if (__ret)						\
+-			radeonGetLock((rmesa), 0);			\
+-		DEBUG_LOCK();						\
+-	} while (0)
+-
+-#define UNLOCK_HARDWARE( rmesa )					\
+-	do {								\
+-		DRM_UNLOCK((rmesa)->dri.fd,				\
+-			(rmesa)->dri.hwLock,				\
+-			(rmesa)->dri.hwContext);			\
+-		DEBUG_RESET();						\
+-	} while (0)
+-
+-#endif				/* __RADEON_LOCK_H__ */
+diff --git a/src/mesa/drivers/dri/r300/radeon_lock.h b/src/mesa/drivers/dri/r300/radeon_lock.h
+new file mode 120000
+index a344837..64bdf94
+--- /dev/null
++++ b/src/mesa/drivers/dri/r300/radeon_lock.h
+@@ -0,0 +1 @@
++../radeon/radeon_lock.h
+\ No newline at end of file
+diff --git a/src/mesa/drivers/dri/r300/radeon_program_pair.c b/src/mesa/drivers/dri/r300/radeon_program_pair.c
+index 58bc0d5..8a945d8 100644
+--- a/src/mesa/drivers/dri/r300/radeon_program_pair.c
++++ b/src/mesa/drivers/dri/r300/radeon_program_pair.c
+@@ -35,7 +35,7 @@
+ 
+ #include "radeon_program_pair.h"
+ 
+-#include "radeon_context.h"
++#include "radeon_common.h"
+ 
+ #include "shader/prog_print.h"
+ 
+diff --git a/src/mesa/drivers/dri/r300/radeon_span.c b/src/mesa/drivers/dri/r300/radeon_span.c
+deleted file mode 100644
+index 16f9fb9..232868c
+--- a/src/mesa/drivers/dri/r300/radeon_span.c
++++ /dev/null
+@@ -1,349 +0,0 @@
+-/**************************************************************************
+-
+-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+-Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+-                     VA Linux Systems Inc., Fremont, California.
+-
+-The Weather Channel (TM) funded Tungsten Graphics to develop the
+-initial release of the Radeon 8500 driver under the XFree86 license.
+-This notice must be preserved.
+-
+-All Rights Reserved.
+-
+-Permission is hereby granted, free of charge, to any person obtaining
+-a copy of this software and associated documentation files (the
+-"Software"), to deal in the Software without restriction, including
+-without limitation the rights to use, copy, modify, merge, publish,
+-distribute, sublicense, and/or sell copies of the Software, and to
+-permit persons to whom the Software is furnished to do so, subject to
+-the following conditions:
+-
+-The above copyright notice and this permission notice (including the
+-next paragraph) shall be included in all copies or substantial
+-portions of the Software.
+-
+-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+-
+-**************************************************************************/
+-
+-/*
+- * Authors:
+- *   Kevin E. Martin <martin@valinux.com>
+- *   Gareth Hughes <gareth@valinux.com>
+- *   Keith Whitwell <keith@tungstengraphics.com>
+- *
+- */
+-
+-#include "main/glheader.h"
+-#include "swrast/swrast.h"
+-
+-#include "r300_state.h"
+-#include "radeon_ioctl.h"
+-#include "r300_ioctl.h"
+-#include "radeon_span.h"
+-
+-#include "drirenderbuffer.h"
+-
+-#define DBG 0
+-
+-/*
+- * Note that all information needed to access pixels in a renderbuffer
+- * should be obtained through the gl_renderbuffer parameter, not per-context
+- * information.
+- */
+-#define LOCAL_VARS						\
+-   driRenderbuffer *drb = (driRenderbuffer *) rb;		\
+-   const __DRIdrawablePrivate *dPriv = drb->dPriv;		\
+-   const GLuint bottom = dPriv->h - 1;				\
+-   GLubyte *buf = (GLubyte *) drb->flippedData			\
+-      + (dPriv->y * drb->flippedPitch + dPriv->x) * drb->cpp;	\
+-   GLuint p;							\
+-   (void) p;
+-
+-#define LOCAL_DEPTH_VARS				\
+-   driRenderbuffer *drb = (driRenderbuffer *) rb;	\
+-   const __DRIdrawablePrivate *dPriv = drb->dPriv;	\
+-   const GLuint bottom = dPriv->h - 1;			\
+-   GLuint xo = dPriv->x;				\
+-   GLuint yo = dPriv->y;				\
+-   GLubyte *buf = (GLubyte *) drb->Base.Data;
+-
+-#define LOCAL_STENCIL_VARS LOCAL_DEPTH_VARS
+-
+-#define Y_FLIP(Y) (bottom - (Y))
+-
+-#define HW_LOCK()
+-
+-#define HW_UNLOCK()
+-
+-/* ================================================================
+- * Color buffer
+- */
+-
+-/* 16 bit, RGB565 color spanline and pixel functions
+- */
+-#define SPANTMP_PIXEL_FMT GL_RGB
+-#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_5_6_5
+-
+-#define TAG(x)    radeon##x##_RGB565
+-#define TAG2(x,y) radeon##x##_RGB565##y
+-#define GET_PTR(X,Y) (buf + ((Y) * drb->flippedPitch + (X)) * 2)
+-#include "spantmp2.h"
+-
+-/* 32 bit, ARGB8888 color spanline and pixel functions
+- */
+-#define SPANTMP_PIXEL_FMT GL_BGRA
+-#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV
+-
+-#define TAG(x)    radeon##x##_ARGB8888
+-#define TAG2(x,y) radeon##x##_ARGB8888##y
+-#define GET_PTR(X,Y) (buf + ((Y) * drb->flippedPitch + (X)) * 4)
+-#include "spantmp2.h"
+-
+-/* ================================================================
+- * Depth buffer
+- */
+-
+-/* The Radeon family has depth tiling on all the time, so we have to convert
+- * the x,y coordinates into the memory bus address (mba) in the same
+- * manner as the engine.  In each case, the linear block address (ba)
+- * is calculated, and then wired with x and y to produce the final
+- * memory address.
+- * The chip will do address translation on its own if the surface registers
+- * are set up correctly. It is not quite enough to get it working with hyperz
+- * too...
+- */
+-
+-static GLuint radeon_mba_z32(const driRenderbuffer * drb, GLint x, GLint y)
+-{
+-	GLuint pitch = drb->pitch;
+-	if (drb->depthHasSurface) {
+-		return 4 * (x + y * pitch);
+-	} else {
+-		GLuint ba, address = 0;	/* a[0..1] = 0           */
+-
+-#ifdef COMPILE_R300
+-		ba = (y / 8) * (pitch / 8) + (x / 8);
+-#else
+-		ba = (y / 16) * (pitch / 16) + (x / 16);
+-#endif
+-
+-		address |= (x & 0x7) << 2;	/* a[2..4] = x[0..2]     */
+-		address |= (y & 0x3) << 5;	/* a[5..6] = y[0..1]     */
+-		address |= (((x & 0x10) >> 2) ^ (y & 0x4)) << 5;	/* a[7]    = x[4] ^ y[2] */
+-		address |= (ba & 0x3) << 8;	/* a[8..9] = ba[0..1]    */
+-
+-		address |= (y & 0x8) << 7;	/* a[10]   = y[3]        */
+-		address |= (((x & 0x8) << 1) ^ (y & 0x10)) << 7;	/* a[11]   = x[3] ^ y[4] */
+-		address |= (ba & ~0x3) << 10;	/* a[12..] = ba[2..]     */
+-
+-		return address;
+-	}
+-}
+-
+-static INLINE GLuint
+-radeon_mba_z16(const driRenderbuffer * drb, GLint x, GLint y)
+-{
+-	GLuint pitch = drb->pitch;
+-	if (drb->depthHasSurface) {
+-		return 2 * (x + y * pitch);
+-	} else {
+-		GLuint ba, address = 0;	/* a[0]    = 0           */
+-
+-		ba = (y / 16) * (pitch / 32) + (x / 32);
+-
+-		address |= (x & 0x7) << 1;	/* a[1..3] = x[0..2]     */
+-		address |= (y & 0x7) << 4;	/* a[4..6] = y[0..2]     */
+-		address |= (x & 0x8) << 4;	/* a[7]    = x[3]        */
+-		address |= (ba & 0x3) << 8;	/* a[8..9] = ba[0..1]    */
+-		address |= (y & 0x8) << 7;	/* a[10]   = y[3]        */
+-		address |= ((x & 0x10) ^ (y & 0x10)) << 7;	/* a[11]   = x[4] ^ y[4] */
+-		address |= (ba & ~0x3) << 10;	/* a[12..] = ba[2..]     */
+-
+-		return address;
+-	}
+-}
+-
+-/* 16-bit depth buffer functions
+- */
+-#define VALUE_TYPE GLushort
+-
+-#define WRITE_DEPTH( _x, _y, d )					\
+-   *(GLushort *)(buf + radeon_mba_z16( drb, _x + xo, _y + yo )) = d;
+-
+-#define READ_DEPTH( d, _x, _y )						\
+-   d = *(GLushort *)(buf + radeon_mba_z16( drb, _x + xo, _y + yo ));
+-
+-#define TAG(x) radeon##x##_z16
+-#include "depthtmp.h"
+-
+-/* 24 bit depth, 8 bit stencil depthbuffer functions
+- *
+- * Careful: It looks like the R300 uses ZZZS byte order while the R200
+- * uses SZZZ for 24 bit depth, 8 bit stencil mode.
+- */
+-#define VALUE_TYPE GLuint
+-
+-#ifdef COMPILE_R300
+-#define WRITE_DEPTH( _x, _y, d )					\
+-do {									\
+-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
+-   GLuint tmp = *(GLuint *)(buf + offset);				\
+-   tmp &= 0x000000ff;							\
+-   tmp |= ((d << 8) & 0xffffff00);					\
+-   *(GLuint *)(buf + offset) = tmp;					\
+-} while (0)
+-#else
+-#define WRITE_DEPTH( _x, _y, d )					\
+-do {									\
+-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
+-   GLuint tmp = *(GLuint *)(buf + offset);				\
+-   tmp &= 0xff000000;							\
+-   tmp |= ((d) & 0x00ffffff);						\
+-   *(GLuint *)(buf + offset) = tmp;					\
+-} while (0)
+-#endif
+-
+-#ifdef COMPILE_R300
+-#define READ_DEPTH( d, _x, _y )						\
+-  do { \
+-    d = (*(GLuint *)(buf + radeon_mba_z32( drb, _x + xo,		\
+-					 _y + yo )) & 0xffffff00) >> 8; \
+-  }while(0)
+-#else
+-#define READ_DEPTH( d, _x, _y )						\
+-   d = *(GLuint *)(buf + radeon_mba_z32( drb, _x + xo,			\
+-					 _y + yo )) & 0x00ffffff;
+-#endif
+-
+-#define TAG(x) radeon##x##_z24_s8
+-#include "depthtmp.h"
+-
+-/* ================================================================
+- * Stencil buffer
+- */
+-
+-/* 24 bit depth, 8 bit stencil depthbuffer functions
+- */
+-#ifdef COMPILE_R300
+-#define WRITE_STENCIL( _x, _y, d )					\
+-do {									\
+-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
+-   GLuint tmp = *(GLuint *)(buf + offset);				\
+-   tmp &= 0xffffff00;							\
+-   tmp |= (d) & 0xff;							\
+-   *(GLuint *)(buf + offset) = tmp;					\
+-} while (0)
+-#else
+-#define WRITE_STENCIL( _x, _y, d )					\
+-do {									\
+-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
+-   GLuint tmp = *(GLuint *)(buf + offset);				\
+-   tmp &= 0x00ffffff;							\
+-   tmp |= (((d) & 0xff) << 24);						\
+-   *(GLuint *)(buf + offset) = tmp;					\
+-} while (0)
+-#endif
+-
+-#ifdef COMPILE_R300
+-#define READ_STENCIL( d, _x, _y )					\
+-do {									\
+-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
+-   GLuint tmp = *(GLuint *)(buf + offset);				\
+-   d = tmp & 0x000000ff;						\
+-} while (0)
+-#else
+-#define READ_STENCIL( d, _x, _y )					\
+-do {									\
+-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
+-   GLuint tmp = *(GLuint *)(buf + offset);				\
+-   d = (tmp & 0xff000000) >> 24;					\
+-} while (0)
+-#endif
+-
+-#define TAG(x) radeon##x##_z24_s8
+-#include "stenciltmp.h"
+-
+-/* Move locking out to get reasonable span performance (10x better
+- * than doing this in HW_LOCK above).  WaitForIdle() is the main
+- * culprit.
+- */
+-
+-static void radeonSpanRenderStart(GLcontext * ctx)
+-{
+-	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+-#ifdef COMPILE_R300
+-	r300ContextPtr r300 = (r300ContextPtr) rmesa;
+-	R300_FIREVERTICES(r300);
+-#else
+-	RADEON_FIREVERTICES(rmesa);
+-#endif
+-	LOCK_HARDWARE(rmesa);
+-	radeonWaitForIdleLocked(rmesa);
+-
+-	/* Read the first pixel in the frame buffer.  This should
+-	 * be a noop, right?  In fact without this conform fails as reading
+-	 * from the framebuffer sometimes produces old results -- the
+-	 * on-card read cache gets mixed up and doesn't notice that the
+-	 * framebuffer has been updated.
+-	 *
+-	 * Note that we should probably be reading some otherwise unused
+-	 * region of VRAM, otherwise we might get incorrect results when
+-	 * reading pixels from the top left of the screen.
+-	 *
+-	 * I found this problem on an R420 with glean's texCube test.
+-	 * Note that the R200 span code also *writes* the first pixel in the
+-	 * framebuffer, but I've found this to be unnecessary.
+-	 *  -- Nicolai Hähnle, June 2008
+-	 */
+-	{
+-		int p;
+-		driRenderbuffer *drb =
+-			(driRenderbuffer *) ctx->WinSysDrawBuffer->_ColorDrawBuffers[0];
+-		volatile int *buf =
+-			(volatile int *)(rmesa->dri.screen->pFB + drb->offset);
+-		p = *buf;
+-	}
+-}
+-
+-static void radeonSpanRenderFinish(GLcontext * ctx)
+-{
+-	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+-	_swrast_flush(ctx);
+-	UNLOCK_HARDWARE(rmesa);
+-}
+-
+-void radeonInitSpanFuncs(GLcontext * ctx)
+-{
+-	struct swrast_device_driver *swdd =
+-	    _swrast_GetDeviceDriverReference(ctx);
+-	swdd->SpanRenderStart = radeonSpanRenderStart;
+-	swdd->SpanRenderFinish = radeonSpanRenderFinish;
+-}
+-
+-/**
+- * Plug in the Get/Put routines for the given driRenderbuffer.
+- */
+-void radeonSetSpanFunctions(driRenderbuffer * drb, const GLvisual * vis)
+-{
+-	if (drb->Base.InternalFormat == GL_RGBA) {
+-		if (vis->redBits == 5 && vis->greenBits == 6
+-		    && vis->blueBits == 5) {
+-			radeonInitPointers_RGB565(&drb->Base);
+-		} else {
+-			radeonInitPointers_ARGB8888(&drb->Base);
+-		}
+-	} else if (drb->Base.InternalFormat == GL_DEPTH_COMPONENT16) {
+-		radeonInitDepthPointers_z16(&drb->Base);
+-	} else if (drb->Base.InternalFormat == GL_DEPTH_COMPONENT24) {
+-		radeonInitDepthPointers_z24_s8(&drb->Base);
+-	} else if (drb->Base.InternalFormat == GL_STENCIL_INDEX8_EXT) {
+-		radeonInitStencilPointers_z24_s8(&drb->Base);
+-	}
+-}
+diff --git a/src/mesa/drivers/dri/r300/radeon_span.c b/src/mesa/drivers/dri/r300/radeon_span.c
+new file mode 120000
+index 16f9fb9..232868c
+--- /dev/null
++++ b/src/mesa/drivers/dri/r300/radeon_span.c
+@@ -0,0 +1 @@
++../radeon/radeon_span.c
+\ No newline at end of file
+diff --git a/src/mesa/drivers/dri/r300/radeon_state.c b/src/mesa/drivers/dri/r300/radeon_state.c
+deleted file mode 100644
+index c401da6..0000000
+--- a/src/mesa/drivers/dri/r300/radeon_state.c
++++ /dev/null
+@@ -1,244 +0,0 @@
+-/**************************************************************************
+-
+-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+-
+-The Weather Channel (TM) funded Tungsten Graphics to develop the
+-initial release of the Radeon 8500 driver under the XFree86 license.
+-This notice must be preserved.
+-
+-Permission is hereby granted, free of charge, to any person obtaining
+-a copy of this software and associated documentation files (the
+-"Software"), to deal in the Software without restriction, including
+-without limitation the rights to use, copy, modify, merge, publish,
+-distribute, sublicense, and/or sell copies of the Software, and to
+-permit persons to whom the Software is furnished to do so, subject to
+-the following conditions:
+-
+-The above copyright notice and this permission notice (including the
+-next paragraph) shall be included in all copies or substantial
+-portions of the Software.
+-
+-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+-
+-**************************************************************************/
+-
+-/*
+- * Authors:
+- *   Keith Whitwell <keith@tungstengraphics.com>
+- */
+-
+-#include "main/glheader.h"
+-#include "main/imports.h"
+-#include "main/api_arrayelt.h"
+-#include "main/enums.h"
+-#include "main/framebuffer.h"
+-#include "main/colormac.h"
+-#include "main/light.h"
+-
+-#include "swrast/swrast.h"
+-#include "vbo/vbo.h"
+-#include "tnl/tnl.h"
+-#include "tnl/t_pipeline.h"
+-#include "swrast_setup/swrast_setup.h"
+-
+-#include "radeon_ioctl.h"
+-#include "radeon_state.h"
+-#include "r300_ioctl.h"
+-
+-
+-/* =============================================================
+- * Scissoring
+- */
+-
+-static GLboolean intersect_rect(drm_clip_rect_t * out,
+-				drm_clip_rect_t * a, drm_clip_rect_t * b)
+-{
+-	*out = *a;
+-	if (b->x1 > out->x1)
+-		out->x1 = b->x1;
+-	if (b->y1 > out->y1)
+-		out->y1 = b->y1;
+-	if (b->x2 < out->x2)
+-		out->x2 = b->x2;
+-	if (b->y2 < out->y2)
+-		out->y2 = b->y2;
+-	if (out->x1 >= out->x2)
+-		return GL_FALSE;
+-	if (out->y1 >= out->y2)
+-		return GL_FALSE;
+-	return GL_TRUE;
+-}
+-
+-void radeonRecalcScissorRects(radeonContextPtr radeon)
+-{
+-	drm_clip_rect_t *out;
+-	int i;
+-
+-	/* Grow cliprect store?
+-	 */
+-	if (radeon->state.scissor.numAllocedClipRects < radeon->numClipRects) {
+-		while (radeon->state.scissor.numAllocedClipRects <
+-		       radeon->numClipRects) {
+-			radeon->state.scissor.numAllocedClipRects += 1;	/* zero case */
+-			radeon->state.scissor.numAllocedClipRects *= 2;
+-		}
+-
+-		if (radeon->state.scissor.pClipRects)
+-			FREE(radeon->state.scissor.pClipRects);
+-
+-		radeon->state.scissor.pClipRects =
+-		    MALLOC(radeon->state.scissor.numAllocedClipRects *
+-			   sizeof(drm_clip_rect_t));
+-
+-		if (radeon->state.scissor.pClipRects == NULL) {
+-			radeon->state.scissor.numAllocedClipRects = 0;
+-			return;
+-		}
+-	}
+-
+-	out = radeon->state.scissor.pClipRects;
+-	radeon->state.scissor.numClipRects = 0;
+-
+-	for (i = 0; i < radeon->numClipRects; i++) {
+-		if (intersect_rect(out,
+-				   &radeon->pClipRects[i],
+-				   &radeon->state.scissor.rect)) {
+-			radeon->state.scissor.numClipRects++;
+-			out++;
+-		}
+-	}
+-}
+-
+-void radeonUpdateScissor(GLcontext* ctx)
+-{
+-	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+-
+-	if (radeon->dri.drawable) {
+-		__DRIdrawablePrivate *dPriv = radeon->dri.drawable;
+-		int x1 = dPriv->x + ctx->Scissor.X;
+-		int y1 = dPriv->y + dPriv->h - (ctx->Scissor.Y + ctx->Scissor.Height);
+-
+-		radeon->state.scissor.rect.x1 = x1;
+-		radeon->state.scissor.rect.y1 = y1;
+-		radeon->state.scissor.rect.x2 = x1 + ctx->Scissor.Width;
+-		radeon->state.scissor.rect.y2 = y1 + ctx->Scissor.Height;
+-
+-		radeonRecalcScissorRects(radeon);
+-	}
+-}
+-
+-static void radeonScissor(GLcontext* ctx, GLint x, GLint y, GLsizei w, GLsizei h)
+-{
+-	if (ctx->Scissor.Enabled) {
+-		/* We don't pipeline cliprect changes */
+-		r300Flush(ctx);
+-		radeonUpdateScissor(ctx);
+-	}
+-}
+-
+-
+-/**
+- * Update cliprects and scissors.
+- */
+-void radeonSetCliprects(radeonContextPtr radeon)
+-{
+-	__DRIdrawablePrivate *const drawable = radeon->dri.drawable;
+-	__DRIdrawablePrivate *const readable = radeon->dri.readable;
+-	GLframebuffer *const draw_fb = (GLframebuffer*)drawable->driverPrivate;
+-	GLframebuffer *const read_fb = (GLframebuffer*)readable->driverPrivate;
+-
+-	if (draw_fb->_ColorDrawBufferIndexes[0] == BUFFER_BACK_LEFT) {
+-		/* Can't ignore 2d windows if we are page flipping. */
+-		if (drawable->numBackClipRects == 0 || radeon->doPageFlip ||
+-		    radeon->sarea->pfCurrentPage == 1) {
+-			radeon->numClipRects = drawable->numClipRects;
+-			radeon->pClipRects = drawable->pClipRects;
+-		} else {
+-			radeon->numClipRects = drawable->numBackClipRects;
+-			radeon->pClipRects = drawable->pBackClipRects;
+-		}
+-	} else {
+-		/* front buffer (or none, or multiple buffers */
+-		radeon->numClipRects = drawable->numClipRects;
+-		radeon->pClipRects = drawable->pClipRects;
+-	}
+-
+-	if ((draw_fb->Width != drawable->w) ||
+-	    (draw_fb->Height != drawable->h)) {
+-		_mesa_resize_framebuffer(radeon->glCtx, draw_fb,
+-					 drawable->w, drawable->h);
+-		draw_fb->Initialized = GL_TRUE;
+-	}
+-
+-	if (drawable != readable) {
+-		if ((read_fb->Width != readable->w) ||
+-		    (read_fb->Height != readable->h)) {
+-			_mesa_resize_framebuffer(radeon->glCtx, read_fb,
+-						 readable->w, readable->h);
+-			read_fb->Initialized = GL_TRUE;
+-		}
+-	}
+-
+-	if (radeon->state.scissor.enabled)
+-		radeonRecalcScissorRects(radeon);
+-
+-	radeon->lastStamp = drawable->lastStamp;
+-}
+-
+-
+-/**
+- * Handle common enable bits.
+- * Called as a fallback by r200Enable/r300Enable.
+- */
+-void radeonEnable(GLcontext* ctx, GLenum cap, GLboolean state)
+-{
+-	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+-
+-	switch(cap) {
+-	case GL_SCISSOR_TEST:
+-		/* We don't pipeline cliprect & scissor changes */
+-		r300Flush(ctx);
+-
+-		radeon->state.scissor.enabled = state;
+-		radeonUpdateScissor(ctx);
+-		break;
+-
+-	default:
+-		return;
+-	}
+-}
+-
+-
+-/**
+- * Initialize default state.
+- * This function is called once at context init time from
+- * r200InitState/r300InitState
+- */
+-void radeonInitState(radeonContextPtr radeon)
+-{
+-	radeon->Fallback = 0;
+-
+-	if (radeon->glCtx->Visual.doubleBufferMode && radeon->sarea->pfCurrentPage == 0) {
+-		radeon->state.color.drawOffset = radeon->radeonScreen->backOffset;
+-		radeon->state.color.drawPitch = radeon->radeonScreen->backPitch;
+-	} else {
+-		radeon->state.color.drawOffset = radeon->radeonScreen->frontOffset;
+-		radeon->state.color.drawPitch = radeon->radeonScreen->frontPitch;
+-	}
+-}
+-
+-
+-/**
+- * Initialize common state functions.
+- * Called by r200InitStateFuncs/r300InitStateFuncs
+- */
+-void radeonInitStateFuncs(struct dd_function_table *functions)
+-{
+-	functions->Scissor = radeonScissor;
+-}
+diff --git a/src/mesa/drivers/dri/r300/radeon_state.h b/src/mesa/drivers/dri/r300/radeon_state.h
+deleted file mode 100644
+index 821cb40..0000000
+--- a/src/mesa/drivers/dri/r300/radeon_state.h
++++ /dev/null
+@@ -1,43 +0,0 @@
+-/*
+-Copyright (C) 2004 Nicolai Haehnle.  All Rights Reserved.
+-
+-Permission is hereby granted, free of charge, to any person obtaining
+-a copy of this software and associated documentation files (the
+-"Software"), to deal in the Software without restriction, including
+-without limitation the rights to use, copy, modify, merge, publish,
+-distribute, sublicense, and/or sell copies of the Software, and to
+-permit persons to whom the Software is furnished to do so, subject to
+-the following conditions:
+-
+-The above copyright notice and this permission notice (including the
+-next paragraph) shall be included in all copies or substantial
+-portions of the Software.
+-
+-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+-
+-**************************************************************************/
+-
+-/*
+- * Authors:
+- *   Nicolai Haehnle <prefect_@gmx.net>
+- */
+-
+-#ifndef __RADEON_STATE_H__
+-#define __RADEON_STATE_H__
+-
+-extern void radeonRecalcScissorRects(radeonContextPtr radeon);
+-extern void radeonSetCliprects(radeonContextPtr radeon);
+-extern void radeonUpdateScissor(GLcontext* ctx);
+-
+-extern void radeonEnable(GLcontext* ctx, GLenum cap, GLboolean state);
+-
+-extern void radeonInitState(radeonContextPtr radeon);
+-extern void radeonInitStateFuncs(struct dd_function_table* functions);
+-
+-#endif
+diff --git a/src/mesa/drivers/dri/radeon/Makefile b/src/mesa/drivers/dri/radeon/Makefile
+index f223b2d..f469c6f 100644
+--- a/src/mesa/drivers/dri/radeon/Makefile
++++ b/src/mesa/drivers/dri/radeon/Makefile
+@@ -4,25 +4,36 @@
+ TOP = ../../../../..
+ include $(TOP)/configs/current
+ 
++CFLAGS += $(RADEON_CFLAGS)
++
+ LIBNAME = radeon_dri.so
+ 
+ MINIGLX_SOURCES = server/radeon_dri.c 
+ 
++RADEON_COMMON_SOURCES = \
++	radeon_texture.c \
++	radeon_common_context.c \
++	radeon_common.c \
++	radeon_dma.c \
++	radeon_lock.c \
++	radeon_bo_legacy.c \
++	radeon_cs_legacy.c \
++	radeon_mipmap_tree.c \
++	radeon_span.c
++
+ DRIVER_SOURCES = \
+ 	radeon_context.c \
+ 	radeon_ioctl.c \
+-	radeon_lock.c \
+ 	radeon_screen.c \
+ 	radeon_state.c \
+ 	radeon_state_init.c \
+ 	radeon_tex.c \
+-	radeon_texmem.c \
+ 	radeon_texstate.c \
+ 	radeon_tcl.c \
+ 	radeon_swtcl.c \
+-	radeon_span.c \
+ 	radeon_maos.c \
+-	radeon_sanity.c 
++	radeon_sanity.c \
++	$(RADEON_COMMON_SOURCES)
+ 
+ C_SOURCES = \
+ 	$(COMMON_SOURCES) \
+@@ -30,6 +41,8 @@ C_SOURCES = \
+ 
+ DRIVER_DEFINES = -DRADEON_COMMON=0
+ 
++DRI_LIB_DEPS += $(RADEON_LDFLAGS)
++
+ X86_SOURCES = 
+ 
+ include ../Makefile.template
+diff --git a/src/mesa/drivers/dri/radeon/radeon_bo_drm.h b/src/mesa/drivers/dri/radeon/radeon_bo_drm.h
+new file mode 100644
+index 0000000..1ed13f1
+--- /dev/null
++++ b/src/mesa/drivers/dri/radeon/radeon_bo_drm.h
+@@ -0,0 +1,182 @@
++/* 
++ * Copyright © 2008 Jérôme Glisse
++ * All Rights Reserved.
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining
++ * a copy of this software and associated documentation files (the
++ * "Software"), to deal in the Software without restriction, including
++ * without limitation the rights to use, copy, modify, merge, publish,
++ * distribute, sub license, and/or sell copies of the Software, and to
++ * permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
++ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
++ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
++ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
++ * USE OR OTHER DEALINGS IN THE SOFTWARE.
++ *
++ * The above copyright notice and this permission notice (including the
++ * next paragraph) shall be included in all copies or substantial portions
++ * of the Software.
++ */
++/*
++ * Authors:
++ *      Jérôme Glisse <glisse@freedesktop.org>
++ */
++#ifndef RADEON_BO_H
++#define RADEON_BO_H
++
++#include <stdio.h>
++#include <stdint.h>
++//#include "radeon_track.h"
++
++/* bo object */
++#define RADEON_BO_FLAGS_MACRO_TILE  1
++#define RADEON_BO_FLAGS_MICRO_TILE  2
++
++struct radeon_bo_manager;
++
++struct radeon_bo {  /* Buffer object record operated on by the _radeon_bo_* inline wrappers in this header. */
++    uint32_t                    alignment;
++    uint32_t                    handle;     /* printed by _radeon_bo_debug; passed to radeon_tracker_add_track in _radeon_bo_open */
++    uint32_t                    size;       /* printed by _radeon_bo_debug */
++    uint32_t                    domains;
++    uint32_t                    flags;      /* presumably RADEON_BO_FLAGS_* bits -- TODO confirm */
++    unsigned                    cref;       /* reference count: incremented by _radeon_bo_ref, decremented by _radeon_bo_unref */
++#ifdef RADEON_BO_TRACK
++    struct radeon_track         *track;     /* set in _radeon_bo_open, cleared in _radeon_bo_unref; tracking builds only */
++#endif
++    void                        *ptr;       /* presumably the CPU mapping filled in by bo_map -- TODO confirm */
++    struct radeon_bo_manager    *bom;       /* manager whose funcs table the wrappers dispatch through */
++    uint32_t                    space_accounted;
++};
++
++/* bo functions */
++struct radeon_bo_funcs {  /* Backend hook table; each hook is reached through the matching _radeon_bo_* inline wrapper. */
++    struct radeon_bo *(*bo_open)(struct radeon_bo_manager *bom,
++                                 uint32_t handle,
++                                 uint32_t size,
++                                 uint32_t alignment,
++                                 uint32_t domains,
++                                 uint32_t flags);   /* result is returned as-is by _radeon_bo_open */
++    void (*bo_ref)(struct radeon_bo *bo);            /* called by _radeon_bo_ref after it increments bo->cref */
++    struct radeon_bo *(*bo_unref)(struct radeon_bo *bo);  /* called by _radeon_bo_unref after it decrements bo->cref; return value is passed back */
++    int (*bo_map)(struct radeon_bo *bo, int write);  /* write is forwarded unchanged by _radeon_bo_map */
++    int (*bo_unmap)(struct radeon_bo *bo);
++    int (*bo_wait)(struct radeon_bo *bo);
++};
++
++struct radeon_bo_manager {  /* Manager handed to bo_open and referenced from each bo through bo->bom. */
++    struct radeon_bo_funcs  *funcs;   /* hook table the inline wrappers dispatch through */
++    int                     fd;       /* presumably the DRM file descriptor -- TODO confirm */
++
++#ifdef RADEON_BO_TRACK
++    struct radeon_tracker   tracker;  /* NOTE(review): needs radeon_track.h, whose #include is commented out above */
++#endif
++};
++    
++/* Print op, the bo pointer, its handle/size/cref in hex and the call site to stderr. */
++static inline void _radeon_bo_debug(struct radeon_bo *bo, const char *op,
++                                    const char *file, const char *func,
++                                    int line)
++{
++    /* %p is only defined for void *, and %X for unsigned int, so cast explicitly. */
++    fprintf(stderr, "%s %p 0x%08X 0x%08X 0x%08X [%s %s %d]\n", op, (void *)bo,
++            (unsigned)bo->handle, (unsigned)bo->size, bo->cref, file, func, line);
++}
++
++/*
++ * Open a buffer object through the manager's bo_open hook.  The six leading
++ * arguments are forwarded unchanged and the hook's result is returned as-is.
++ * file/func/line name the call site; they are only consumed when
++ * RADEON_BO_TRACK is defined, where a non-NULL result has its handle
++ * registered with the tracker and an "open" event logged.
++ */
++static inline struct radeon_bo *
++_radeon_bo_open(struct radeon_bo_manager *bom, uint32_t handle, uint32_t size,
++                uint32_t alignment, uint32_t domains, uint32_t flags,
++                const char *file, const char *func, int line)
++{
++    struct radeon_bo *opened = bom->funcs->bo_open(bom, handle, size, alignment, domains, flags);
++#ifdef RADEON_BO_TRACK
++    if (opened) {
++        opened->track = radeon_tracker_add_track(&bom->tracker, opened->handle);
++        radeon_track_add_event(opened->track, file, func, "open", line);
++    }
++#endif
++    return opened;
++}
++
++/* Take a reference: bump bo->cref, log a "ref" event when RADEON_BO_TRACK is
++ * defined, then notify the backend through its bo_ref hook. */
++static inline void _radeon_bo_ref(struct radeon_bo *bo, const char *file,
++                                  const char *func, int line)
++{
++    bo->cref += 1;
++#ifdef RADEON_BO_TRACK
++    radeon_track_add_event(bo->track, file, func, "ref", line);
++#endif
++    bo->bom->funcs->bo_ref(bo);
++}
++
++/* Drop a reference (bo->cref--) and return the backend bo_unref hook's result.
++ * Tracking builds also log "unref" and retire the track when cref hits zero. */
++static inline struct radeon_bo *_radeon_bo_unref(struct radeon_bo *bo, const char *file,
++                                                 const char *func, int line)
++{
++    bo->cref -= 1;
++#ifdef RADEON_BO_TRACK
++    radeon_track_add_event(bo->track, file, func, "unref", line);
++    if (bo->cref <= 0) {
++        radeon_tracker_remove_track(&bo->bom->tracker, bo->track);
++        bo->track = NULL;
++    }
++#endif
++    return bo->bom->funcs->bo_unref(bo);
++}
++
++/* Forward to the manager's bo_map hook, passing write through unchanged, and
++ * return the hook's status.  file/func/line are accepted but unused here. */
++static inline int _radeon_bo_map(struct radeon_bo *bo, int write,
++                                 const char *file, const char *func, int line)
++{
++    struct radeon_bo_funcs *hooks = bo->bom->funcs;
++    return hooks->bo_map(bo, write);
++}
++
++/* Forward to the manager's bo_unmap hook; file/func/line are unused here. */
++static inline int _radeon_bo_unmap(struct radeon_bo *bo, const char *file,
++                                   const char *func, int line)
++{
++    struct radeon_bo_funcs *hooks = bo->bom->funcs;
++    return hooks->bo_unmap(bo);
++}
++
++/* Forward to the manager's bo_wait hook; file/func/line are unused here. */
++static inline int _radeon_bo_wait(struct radeon_bo *bo, const char *file,
++                                  const char *func, int line)
++{
++    struct radeon_bo_funcs *hooks = bo->bom->funcs;
++    return hooks->bo_wait(bo);
++}
++
++/*
++ * Public entry points.  Each macro forwards to the matching _radeon_bo_*
++ * inline and appends the caller's file, function name and line number.
++ * __func__ (standard C99) is used throughout instead of mixing it with the
++ * GCC-specific __FUNCTION__ spelling; both name the enclosing function.
++ */
++#define radeon_bo_open(bom, h, s, a, d, f) \
++    _radeon_bo_open(bom, h, s, a, d, f, __FILE__, __func__, __LINE__)
++#define radeon_bo_ref(bo)           _radeon_bo_ref(bo, __FILE__, __func__, __LINE__)
++#define radeon_bo_unref(bo)         _radeon_bo_unref(bo, __FILE__, __func__, __LINE__)
++#define radeon_bo_map(bo, w)        _radeon_bo_map(bo, w, __FILE__, __func__, __LINE__)
++#define radeon_bo_unmap(bo)         _radeon_bo_unmap(bo, __FILE__, __func__, __LINE__)
++#define radeon_bo_debug(bo, opcode) _radeon_bo_debug(bo, opcode, __FILE__, __func__, __LINE__)
++#define radeon_bo_wait(bo)          _radeon_bo_wait(bo, __FILE__, __func__, __LINE__)
++
++#endif
+diff --git a/src/mesa/drivers/dri/radeon/radeon_bo_legacy.c b/src/mesa/drivers/dri/radeon/radeon_bo_legacy.c
+new file mode 100644
+index 0000000..03a6299
+--- /dev/null
++++ b/src/mesa/drivers/dri/radeon/radeon_bo_legacy.c
+@@ -0,0 +1,825 @@
++/* 
++ * Copyright © 2008 Nicolai Haehnle
++ * Copyright © 2008 Dave Airlie
++ * Copyright © 2008 Jérôme Glisse
++ * All Rights Reserved.
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the
++ * "Software"), to deal in the Software without restriction, including
++ * without limitation the rights to use, copy, modify, merge, publish,
++ * distribute, sub license, and/or sell copies of the Software, and to
++ * permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
++ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
++ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
++ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
++ * USE OR OTHER DEALINGS IN THE SOFTWARE.
++ *
++ * The above copyright notice and this permission notice (including the
++ * next paragraph) shall be included in all copies or substantial portions
++ * of the Software.
++ */
++/*
++ * Authors:
++ *      Aapo Tahkola <aet@rasterburn.org>
++ *      Nicolai Haehnle <prefect_@gmx.net>
++ *      Dave Airlie
++ *      Jérôme Glisse <glisse@freedesktop.org>
++ */
++#include <stdio.h>
++#include <stddef.h>
++#include <stdint.h>
++#include <stdlib.h>
++#include <string.h>
++#include <errno.h>
++#include <unistd.h>
++#include <sys/mman.h>
++#include <sys/ioctl.h>
++#include "xf86drm.h"
++#include "texmem.h"
++#include "main/simple_list.h"
++
++#include "drm.h"
++#include "radeon_drm.h"
++#include "radeon_common.h"
++#include "radeon_bocs_wrapper.h"
++
++/* Wraps a texmem driTextureObject with a back-pointer to its bo; base stays first because driTextureObject * is cast back to this type. */
++struct bo_legacy_texture_object {
++    driTextureObject    base;
++    struct bo_legacy *parent; /* owning bo; its tobj/validated fields are cleared when this object is destroyed */
++};
++/* Legacy (non kernel-mm) buffer object; base is the first member, so struct radeon_bo * is cast to struct bo_legacy *. */
++struct bo_legacy {
++    struct radeon_bo    base;
++    int                 map_count;
++    uint32_t            pending;    /* age compared against the manager's current_age */
++    int                 is_pending; /* > 0 while pending; that many references are dropped on retirement */
++    int                 static_bo;  /* non-zero: bo_free() releases neither handle nor storage */
++    uint32_t            offset;     /* for GTT bos: gart_texture_offset + offset returned by the allocator */
++    struct bo_legacy_texture_object *tobj;
++    int                 validated;
++    int                 dirty;
++    void                *ptr;       /* GART mapping or malloc'ed backing store; NULL if allocation failed */
++    struct bo_legacy    *next, *prev;   /* links in the manager's bos list */
++    struct bo_legacy    *pnext, *pprev; /* links in the manager's pending_bos list */
++};
++/* Legacy buffer-object manager; base is the first member, so struct radeon_bo_manager * is cast to this type. */
++struct bo_manager_legacy {
++    struct radeon_bo_manager    base;
++    unsigned                    nhandle;        /* next never-used handle value */
++    unsigned                    nfree_handles;  /* capacity of free_handles, grown in steps of 0x100 */
++    unsigned                    cfree_handles;  /* number of entries currently stored in free_handles */
++    uint32_t                    current_age;    /* refreshed by legacy_get_current_age() */
++    struct bo_legacy            bos;            /* head node of the list of all bos */
++    struct bo_legacy            pending_bos;    /* head node of the pending list (pnext/pprev links) */
++    uint32_t                    fb_location;
++    uint32_t                    texture_offset;
++    unsigned                    dma_alloc_size; /* sum of the sizes of live GART allocations */
++    uint32_t                    dma_buf_count;  /* number of live GART allocations */
++    unsigned                    cpendings;      /* decremented each time a pending bo is retired */
++    driTextureObject            texture_swapped;
++    driTexHeap                  *texture_heap;
++    struct radeon_screen        *screen;
++    unsigned                    *free_handles;  /* stack of recycled handles; 0 marks an invalidated slot */
++};
++/* Texture-object destroy callback: detach t from its owning bo and mark that bo as no longer validated; data is unused. */
++static void bo_legacy_tobj_destroy(void *data, driTextureObject *t)
++{
++    struct bo_legacy_texture_object *tobj = (struct bo_legacy_texture_object *)t;
++    
++    if (tobj->parent) {
++        tobj->parent->tobj = NULL;
++        tobj->parent->validated = 0;
++    }
++}
++/* Shrink the free-handle stack while its top entry is 0 (a slot invalidated by legacy_free_handle()). */
++static inline void clean_handles(struct bo_manager_legacy *bom)
++{
++    while (bom->cfree_handles > 0 &&
++           !bom->free_handles[bom->cfree_handles - 1]) {
++        bom->cfree_handles--;
++    }
++}
++static int legacy_new_handle(struct bo_manager_legacy *bom, uint32_t *handle)
++{
++    uint32_t tmp;
++    /* Pop a recycled handle if one exists, else use bom->nhandle++; *handle stays 0 and -EINVAL is returned once nhandle is 0xFFFFFFFF. */
++    *handle = 0;
++    if (bom->nhandle == 0xFFFFFFFF) {
++        return -EINVAL;
++    }
++    if (bom->cfree_handles > 0) {
++        tmp = bom->free_handles[--bom->cfree_handles];
++	clean_handles(bom); /* keep the new top of the stack non-zero */
++    } else {
++        bom->cfree_handles = 0;
++        tmp = bom->nhandle++;
++    }
++    assert(tmp); /* 0 means "no handle": legacy_free_handle() ignores it and bo_open() treats it as "create" */
++    *handle = tmp;
++    return 0;
++}
++/* Give a handle back: freeing the newest handle lowers nhandle, any other handle is pushed on free_handles; 0 or -ENOMEM. */
++static int legacy_free_handle(struct bo_manager_legacy *bom, uint32_t handle)
++{
++    unsigned *handles; /* same element type as bom->free_handles */
++
++    if (!handle) {
++        return 0;
++    }
++    if (handle == (bom->nhandle - 1)) {
++        int i;
++
++        bom->nhandle--;
++        for (i = bom->cfree_handles - 1; i >= 0; i--) {
++            if (bom->free_handles[i] == (bom->nhandle - 1)) {
++                bom->nhandle--;
++                bom->free_handles[i] = 0; /* invalidated; clean_handles() trims such slots off the top */
++            }
++        }
++        clean_handles(bom);
++        return 0;
++    }
++    if (bom->cfree_handles < bom->nfree_handles) {
++        bom->free_handles[bom->cfree_handles++] = handle;
++        return 0;
++    }
++    bom->nfree_handles += 0x100;
++    handles = realloc(bom->free_handles, bom->nfree_handles * sizeof(*handles));
++    if (handles == NULL) {
++        bom->nfree_handles -= 0x100; /* the old array is still valid; undo the capacity bump */
++        return -ENOMEM;
++    }
++    bom->free_handles = handles;
++    bom->free_handles[bom->cfree_handles++] = handle;
++    return 0;
++}
++/* Refresh boml->current_age: via the GETPARAM ioctl on R300-class screens, otherwise from screen->scratch[3]. */
++static void legacy_get_current_age(struct bo_manager_legacy *boml)
++{
++    if (IS_R300_CLASS(boml->screen)) {
++        drm_radeon_getparam_t query;
++        int err;
++
++        query.param = RADEON_PARAM_LAST_CLEAR;
++        query.value = (int *)&boml->current_age;
++        err = drmCommandWriteRead(boml->base.fd, DRM_RADEON_GETPARAM, &query, sizeof(query));
++        if (err) {
++            fprintf(stderr, "%s: drmRadeonGetParam: %d\n", __FUNCTION__, err);
++            exit(1); /* a failing ioctl is treated as fatal */
++        }
++    } else {
++        boml->current_age = boml->screen->scratch[3];
++    }
++}
++/* Returns 1 while bo is still pending, else 0; compares against boml->current_age as-is and retires the bo once its age is reached. */
++static int legacy_is_pending(struct radeon_bo *bo)
++{
++    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bo->bom;
++    struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
++
++    if (bo_legacy->is_pending <= 0) {
++        bo_legacy->is_pending = 0;
++        return 0;
++    }
++    if (boml->current_age >= bo_legacy->pending) { /* recorded age reached: unlink from the pending list */
++        if (boml->pending_bos.pprev == bo_legacy) {
++            boml->pending_bos.pprev = bo_legacy->pprev;
++        }
++        bo_legacy->pprev->pnext = bo_legacy->pnext;
++        if (bo_legacy->pnext) {
++            bo_legacy->pnext->pprev = bo_legacy->pprev;
++        }
++	assert(bo_legacy->is_pending <= bo->cref);
++        while (bo_legacy->is_pending--) { /* drop one reference per pending count */
++	    bo = radeon_bo_unref(bo);
++	    if (!bo) /* unref returned NULL: stop using the bo */
++	      break;
++        }
++	if (bo) /* bo_legacy is only written again when the last unref did not return NULL */
++	  bo_legacy->is_pending = 0;
++        boml->cpendings--;
++        return 0;
++    }
++    return 1;
++}
++/* Block until bo is no longer pending, polling the current age every 10 microseconds; always returns 0. */
++static int legacy_wait_pending(struct radeon_bo *bo)
++{
++    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bo->bom;
++    struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
++
++    if (!bo_legacy->is_pending) {
++        return 0;
++    }
++    /* FIXME: no lockup detection; this is an unbounded userspace busy loop */
++    legacy_get_current_age(boml);
++    while (legacy_is_pending(bo)) {
++        usleep(10);
++        legacy_get_current_age(boml);
++    }
++    return 0;
++}
++/* Refresh the current age, then run legacy_is_pending() over every bo on the pending list; debug != 0 logs each entry. */
++static void legacy_track_pending(struct bo_manager_legacy *boml, int debug)
++{
++    struct bo_legacy *cur;
++    struct bo_legacy *following;
++
++    legacy_get_current_age(boml);
++    for (cur = boml->pending_bos.pnext; cur != NULL; cur = following) {
++        if (debug) {
++            fprintf(stderr,"pending %p %d %d %d\n", cur, cur->base.size,
++                    boml->current_age, cur->pending);
++        }
++        /* read the successor first: the call below may unlink cur and drop references to it */
++        following = cur->pnext;
++        /* called only for its side effect; the result is not needed */
++        (void)legacy_is_pending(&(cur->base));
++    }
++}
++/* Wait for the first bo on the pending list to retire; returns -1 if the list is empty, 0 otherwise. */
++static int legacy_wait_any_pending(struct bo_manager_legacy *boml)
++{
++    struct bo_legacy *bo_legacy;
++
++    legacy_get_current_age(boml);
++    bo_legacy = boml->pending_bos.pnext;
++    if (!bo_legacy)
++      return -1;
++    legacy_wait_pending(&bo_legacy->base);
++    return 0;
++}
++/* Destroy the texture object of every validated bo in the manager's list and mark those bos as not validated. */
++static void legacy_kick_all_buffers(struct bo_manager_legacy *boml)
++{
++    struct bo_legacy *legacy;
++
++    /* stop on NULL as well as on the head node: bo_allocate()/bo_free() treat this list as NULL-terminated */
++    for (legacy = boml->bos.next;
++         legacy != NULL && legacy != &boml->bos;
++         legacy = legacy->next) {
++        if (!legacy->tobj || !legacy->validated) {
++            continue;
++        }
++        driDestroyTextureObject(&legacy->tobj->base);
++        legacy->tobj = 0;
++        legacy->validated = 0;
++    }
++}
++/* Allocate a zeroed bo_legacy with size rounded up to a page multiple and link it at the front of boml->bos; NULL on OOM. */
++static struct bo_legacy *bo_allocate(struct bo_manager_legacy *boml,
++                                     uint32_t size,
++                                     uint32_t alignment,
++                                     uint32_t domains,
++                                     uint32_t flags)
++{
++    static int page_mask;
++    struct bo_legacy *node;
++
++    /* page_mask caches (page size - 1) and is used as the rounding mask */
++    if (page_mask == 0)
++        page_mask = getpagesize() - 1;
++    size = (size + page_mask) & ~page_mask;
++
++    node = (struct bo_legacy*)calloc(1, sizeof(struct bo_legacy));
++    if (node == NULL) {
++        return NULL;
++    }
++
++    /* calloc() zeroed the struct, so fields not assigned below (pnext, pprev, ...) stay NULL/0 */
++    node->base.bom = (struct radeon_bo_manager*)boml;
++    node->base.handle = 0;
++    node->base.size = size;
++    node->base.alignment = alignment;
++    node->base.domains = domains;
++    node->base.flags = flags;
++    node->base.ptr = NULL;
++    node->map_count = 0;
++
++    /* insert right after the list head */
++    node->prev = &boml->bos;
++    node->next = boml->bos.next;
++    if (node->next) {
++        node->next->prev = node;
++    }
++    boml->bos.next = node;
++    return node;
++}
++/* Allocate GART memory for bo via DRM_RADEON_ALLOC; on success sets ptr/offset and bo->size, else ptr = NULL and the error is returned. */
++static int bo_dma_alloc(struct radeon_bo *bo)
++{
++    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bo->bom;
++    struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
++    drm_radeon_mem_alloc_t alloc;
++    unsigned size;
++    int base_offset;
++    int r;
++
++    /* round the size up to a multiple of 4 KiB */
++    size = (((4 * 1024) - 1) + bo->size) & ~((4 * 1024) - 1);
++    alloc.region = RADEON_MEM_REGION_GART;
++    alloc.alignment = bo_legacy->base.alignment;
++    alloc.size = size;
++    alloc.region_offset = &base_offset;
++    r = drmCommandWriteRead(bo->bom->fd,
++                            DRM_RADEON_ALLOC,
++                            &alloc,
++                            sizeof(alloc));
++    if (r) {
++        /* ptr is set to NULL if dma allocation failed */
++        bo_legacy->ptr = NULL;
++        return r;
++    }
++    bo_legacy->ptr = boml->screen->gartTextures.map + base_offset;
++    bo_legacy->offset = boml->screen->gart_texture_offset + base_offset;
++    bo->size = size; /* record the rounded size; bo_dma_free() subtracts it from dma_alloc_size */
++    boml->dma_alloc_size += size;
++    boml->dma_buf_count++;
++    return 0;
++}
++/* Release a bo's GART allocation via DRM_RADEON_FREE and update the manager's counters; 0 on success or if nothing was allocated. */
++static int bo_dma_free(struct radeon_bo *bo)
++{
++    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bo->bom;
++    struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
++    drm_radeon_mem_free_t memfree;
++    int r;
++
++    if (bo_legacy->ptr == NULL) {
++        /* ptr is set to NULL if dma allocation failed */
++        return 0;
++    }
++    legacy_get_current_age(boml);
++    memfree.region = RADEON_MEM_REGION_GART;
++    memfree.region_offset  = bo_legacy->offset;
++    memfree.region_offset -= boml->screen->gart_texture_offset; /* back to the region-relative offset used at allocation */
++    r = drmCommandWrite(boml->base.fd,
++                        DRM_RADEON_FREE,
++                        &memfree,
++                        sizeof(memfree));
++    if (r) {
++        fprintf(stderr, "Failed to free bo[%p] at %08x\n",
++                &bo_legacy->base, memfree.region_offset);
++        fprintf(stderr, "ret = %s\n", strerror(-r));
++        return r;
++    }
++    boml->dma_alloc_size -= bo_legacy->base.size;
++    boml->dma_buf_count--;
++    return 0;
++}
++/* Unlink a bo from the manager's list, release its handle and storage unless it is static, then free it; NULL is accepted. */
++static void bo_free(struct bo_legacy *bo_legacy)
++{
++    struct bo_manager_legacy *boml;
++
++    if (bo_legacy == NULL) {
++        return;
++    }
++    boml = (struct bo_manager_legacy *)bo_legacy->base.bom;
++    bo_legacy->prev->next = bo_legacy->next;
++    if (bo_legacy->next) {
++        bo_legacy->next->prev = bo_legacy->prev;
++    }
++    if (!bo_legacy->static_bo) {
++        legacy_free_handle(boml, bo_legacy->base.handle); /* return value (possible -ENOMEM) is ignored */
++        if (bo_legacy->base.domains & RADEON_GEM_DOMAIN_GTT) {
++            /* dma buffers */
++            bo_dma_free(&bo_legacy->base);
++        } else {
++  	    driDestroyTextureObject(&bo_legacy->tobj->base);
++	    bo_legacy->tobj = NULL;
++            /* free backing store */
++            free(bo_legacy->ptr);
++        }
++    }
++    memset(bo_legacy, 0 , sizeof(struct bo_legacy)); /* zero the struct before releasing it */
++    free(bo_legacy);
++}
++/* handle != 0: find that bo and take a reference; handle == 0: create a bo holding one reference. Returns NULL on failure. */
++static struct radeon_bo *bo_open(struct radeon_bo_manager *bom,
++                                 uint32_t handle,
++                                 uint32_t size,
++                                 uint32_t alignment,
++                                 uint32_t domains,
++                                 uint32_t flags)
++{
++    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bom;
++    struct bo_legacy *bo_legacy;
++    int r;
++
++    if (handle) {
++        bo_legacy = boml->bos.next;
++        while (bo_legacy) {
++            if (bo_legacy->base.handle == handle) {
++                radeon_bo_ref(&(bo_legacy->base));
++                return (struct radeon_bo*)bo_legacy;
++            }
++            bo_legacy = bo_legacy->next;
++        }
++        return NULL;
++    }
++
++    bo_legacy = bo_allocate(boml, size, alignment, domains, flags);
++    if (bo_legacy == NULL) { return NULL; } /* out of memory; on success bo_allocate() callocs, so static_bo is already 0 */
++    r = legacy_new_handle(boml, &bo_legacy->base.handle);
++    if (r) {
++        bo_free(bo_legacy);
++        return NULL;
++    }
++    if (bo_legacy->base.domains & RADEON_GEM_DOMAIN_GTT) {
++    retry:
++        legacy_track_pending(boml, 0);
++        /* dma buffers */
++
++        r = bo_dma_alloc(&(bo_legacy->base));
++        if (r) {
++            /* allocation failed: wait for a pending bo to retire and retry; give up when nothing is pending */
++            if (legacy_wait_any_pending(boml) == -1) {
++                bo_free(bo_legacy);
++                return NULL;
++            }
++            goto retry;
++        }
++    } else {
++        bo_legacy->ptr = malloc(bo_legacy->base.size);
++        if (bo_legacy->ptr == NULL) {
++            bo_free(bo_legacy);
++            return NULL;
++        }
++    }
++    radeon_bo_ref(&(bo_legacy->base));
++    return (struct radeon_bo*)bo_legacy;
++}
++/* bo_ref hook: nothing to do here; bo->cref is already incremented by _radeon_bo_ref() before this hook is called. */
++static void bo_ref(struct radeon_bo *bo)
++{
++}
++
++static struct radeon_bo *bo_unref(struct radeon_bo *bo)
++{
++    struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
++
++    if (bo->cref <= 0) {
++        bo_legacy->prev->next = bo_legacy->next;
++        if (bo_legacy->next) {
++            bo_legacy->next->prev = bo_legacy->prev;
++        }
++        if (!bo_legacy->is_pending) {
++            bo_free(bo_legacy);
 +        }
 +        return NULL;
 +    }
@@ -1187,6 +20653,28 @@ index 0000000..f80f0f7
 +#include "radeon_cs_legacy.h"
 +
 +#endif
+diff --git a/src/mesa/drivers/dri/radeon/radeon_chipset.h b/src/mesa/drivers/dri/radeon/radeon_chipset.h
+index f6bd1eb..55a73ea 100644
+--- a/src/mesa/drivers/dri/radeon/radeon_chipset.h
++++ b/src/mesa/drivers/dri/radeon/radeon_chipset.h
+@@ -247,9 +247,6 @@
+ #define PCI_CHIP_RS350_7835		0x7835
+ #define PCI_CHIP_RS690_791E             0x791E
+ #define PCI_CHIP_RS690_791F             0x791F
+-#define PCI_CHIP_RS600_793F             0x793F
+-#define PCI_CHIP_RS600_7941             0x7941
+-#define PCI_CHIP_RS600_7942             0x7942
+ #define PCI_CHIP_RS740_796C             0x796C
+ #define PCI_CHIP_RS740_796D             0x796D
+ #define PCI_CHIP_RS740_796E             0x796E
+@@ -273,7 +270,6 @@ enum {
+    CHIP_FAMILY_R420,
+    CHIP_FAMILY_RV410,
+    CHIP_FAMILY_RS400,
+-   CHIP_FAMILY_RS600,
+    CHIP_FAMILY_RS690,
+    CHIP_FAMILY_RS740,
+    CHIP_FAMILY_RV515,
 diff --git a/src/mesa/drivers/dri/radeon/radeon_cmdbuf.h b/src/mesa/drivers/dri/radeon/radeon_cmdbuf.h
 new file mode 100644
 index 0000000..4b5116c
@@ -3361,12 +22849,1608 @@ index 0000000..a200e90
 +#endif
 +
 +#endif
-diff --git a/src/mesa/drivers/dri/radeon/radeon_cs_drm.h b/src/mesa/drivers/dri/radeon/radeon_cs_drm.h
+diff --git a/src/mesa/drivers/dri/radeon/radeon_compat.c b/src/mesa/drivers/dri/radeon/radeon_compat.c
+deleted file mode 100644
+index 46b490d..0000000
+--- a/src/mesa/drivers/dri/radeon/radeon_compat.c
++++ /dev/null
+@@ -1,301 +0,0 @@
+-/**************************************************************************
+-
+-Copyright 2002 ATI Technologies Inc., Ontario, Canada, and
+-               Tungsten Graphics Inc., Austin, Texas.
+-
+-All Rights Reserved.
+-
+-Permission is hereby granted, free of charge, to any person obtaining a
+-copy of this software and associated documentation files (the "Software"),
+-to deal in the Software without restriction, including without limitation
+-on the rights to use, copy, modify, merge, publish, distribute, sub
+-license, and/or sell copies of the Software, and to permit persons to whom
+-the Software is furnished to do so, subject to the following conditions:
+-
+-The above copyright notice and this permission notice (including the next
+-paragraph) shall be included in all copies or substantial portions of the
+-Software.
+-
+-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+-FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+-ATI, TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+-DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+-OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+-USE OR OTHER DEALINGS IN THE SOFTWARE.
+-
+-**************************************************************************/
+-
+-/*
+- * Authors:
+- *   Keith Whitwell <keith@tungstengraphics.com>
+- *
+- */
+-
+-#include "main/glheader.h"
+-#include "main/imports.h"
+-
+-#include "radeon_context.h"
+-#include "radeon_state.h"
+-#include "radeon_ioctl.h"
+-
+-
+-static struct { 
+-	int start; 
+-	int len; 
+-	const char *name;
+-} packet[RADEON_MAX_STATE_PACKETS] = {
+-	{ RADEON_PP_MISC,7,"RADEON_PP_MISC" },
+-	{ RADEON_PP_CNTL,3,"RADEON_PP_CNTL" },
+-	{ RADEON_RB3D_COLORPITCH,1,"RADEON_RB3D_COLORPITCH" },
+-	{ RADEON_RE_LINE_PATTERN,2,"RADEON_RE_LINE_PATTERN" },
+-	{ RADEON_SE_LINE_WIDTH,1,"RADEON_SE_LINE_WIDTH" },
+-	{ RADEON_PP_LUM_MATRIX,1,"RADEON_PP_LUM_MATRIX" },
+-	{ RADEON_PP_ROT_MATRIX_0,2,"RADEON_PP_ROT_MATRIX_0" },
+-	{ RADEON_RB3D_STENCILREFMASK,3,"RADEON_RB3D_STENCILREFMASK" },
+-	{ RADEON_SE_VPORT_XSCALE,6,"RADEON_SE_VPORT_XSCALE" },
+-	{ RADEON_SE_CNTL,2,"RADEON_SE_CNTL" },
+-	{ RADEON_SE_CNTL_STATUS,1,"RADEON_SE_CNTL_STATUS" },
+-	{ RADEON_RE_MISC,1,"RADEON_RE_MISC" },
+-	{ RADEON_PP_TXFILTER_0,6,"RADEON_PP_TXFILTER_0" },
+-	{ RADEON_PP_BORDER_COLOR_0,1,"RADEON_PP_BORDER_COLOR_0" },
+-	{ RADEON_PP_TXFILTER_1,6,"RADEON_PP_TXFILTER_1" },
+-	{ RADEON_PP_BORDER_COLOR_1,1,"RADEON_PP_BORDER_COLOR_1" },
+-	{ RADEON_PP_TXFILTER_2,6,"RADEON_PP_TXFILTER_2" },
+-	{ RADEON_PP_BORDER_COLOR_2,1,"RADEON_PP_BORDER_COLOR_2" },
+-	{ RADEON_SE_ZBIAS_FACTOR,2,"RADEON_SE_ZBIAS_FACTOR" },
+-	{ RADEON_SE_TCL_OUTPUT_VTX_FMT,11,"RADEON_SE_TCL_OUTPUT_VTX_FMT" },
+-	{ RADEON_SE_TCL_MATERIAL_EMMISSIVE_RED,17,"RADEON_SE_TCL_MATERIAL_EMMISSIVE_RED" },
+-};
+-
+-
+-static void radeonCompatEmitPacket( radeonContextPtr rmesa, 
+-				    struct radeon_state_atom *state )
+-{
+-   drm_radeon_sarea_t *sarea = rmesa->sarea;
+-   drm_radeon_context_regs_t *ctx = &sarea->context_state;
+-   drm_radeon_texture_regs_t *tex0 = &sarea->tex_state[0];
+-   drm_radeon_texture_regs_t *tex1 = &sarea->tex_state[1];
+-   int i;
+-   int *buf = state->cmd;
+-
+-   for ( i = 0 ; i < state->cmd_size ; ) {
+-      drm_radeon_cmd_header_t *header = (drm_radeon_cmd_header_t *)&buf[i++];
+-
+-      if (RADEON_DEBUG & DEBUG_STATE)
+-	 fprintf(stderr, "%s %d: %s\n", __FUNCTION__, header->packet.packet_id,
+-		 packet[(int)header->packet.packet_id].name);
+-
+-      switch (header->packet.packet_id) {
+-      case RADEON_EMIT_PP_MISC:
+-	 ctx->pp_misc = buf[i++]; 
+-	 ctx->pp_fog_color = buf[i++];
+-	 ctx->re_solid_color = buf[i++];
+-	 ctx->rb3d_blendcntl = buf[i++];
+-	 ctx->rb3d_depthoffset = buf[i++];
+-	 ctx->rb3d_depthpitch = buf[i++];
+-	 ctx->rb3d_zstencilcntl = buf[i++];
+-	 sarea->dirty |= RADEON_UPLOAD_CONTEXT;
+-	 break;
+-      case RADEON_EMIT_PP_CNTL:
+-	 ctx->pp_cntl = buf[i++];
+-	 ctx->rb3d_cntl = buf[i++];
+-	 ctx->rb3d_coloroffset = buf[i++];
+-	 sarea->dirty |= RADEON_UPLOAD_CONTEXT;
+-	 break;
+-      case RADEON_EMIT_RB3D_COLORPITCH:
+-	 ctx->rb3d_colorpitch = buf[i++];
+-	 sarea->dirty |= RADEON_UPLOAD_CONTEXT;
+-	 break;
+-      case RADEON_EMIT_RE_LINE_PATTERN:
+-	 ctx->re_line_pattern = buf[i++];
+-	 ctx->re_line_state = buf[i++];
+-	 sarea->dirty |= RADEON_UPLOAD_LINE;
+-	 break;
+-      case RADEON_EMIT_SE_LINE_WIDTH:
+-	 ctx->se_line_width = buf[i++];
+-	 sarea->dirty |= RADEON_UPLOAD_LINE;
+-	 break;
+-      case RADEON_EMIT_PP_LUM_MATRIX:
+-	 ctx->pp_lum_matrix = buf[i++];
+-	 sarea->dirty |= RADEON_UPLOAD_BUMPMAP;
+-	 break;
+-      case RADEON_EMIT_PP_ROT_MATRIX_0:
+-	 ctx->pp_rot_matrix_0 = buf[i++];
+-	 ctx->pp_rot_matrix_1 = buf[i++];
+-	 sarea->dirty |= RADEON_UPLOAD_BUMPMAP;
+-	 break;
+-      case RADEON_EMIT_RB3D_STENCILREFMASK:
+-	 ctx->rb3d_stencilrefmask = buf[i++];
+-	 ctx->rb3d_ropcntl = buf[i++];
+-	 ctx->rb3d_planemask = buf[i++];
+-	 sarea->dirty |= RADEON_UPLOAD_MASKS;
+-	 break;
+-      case RADEON_EMIT_SE_VPORT_XSCALE:
+-	 ctx->se_vport_xscale = buf[i++];
+-	 ctx->se_vport_xoffset = buf[i++];
+-	 ctx->se_vport_yscale = buf[i++];
+-	 ctx->se_vport_yoffset = buf[i++];
+-	 ctx->se_vport_zscale = buf[i++];
+-	 ctx->se_vport_zoffset = buf[i++];
+-	 sarea->dirty |= RADEON_UPLOAD_VIEWPORT;
+-	 break;
+-      case RADEON_EMIT_SE_CNTL:
+-	 ctx->se_cntl = buf[i++];
+-	 ctx->se_coord_fmt = buf[i++];
+-	 sarea->dirty |= RADEON_UPLOAD_CONTEXT | RADEON_UPLOAD_VERTFMT;
+-	 break;
+-      case RADEON_EMIT_SE_CNTL_STATUS:
+-	 ctx->se_cntl_status = buf[i++];
+-	 sarea->dirty |= RADEON_UPLOAD_SETUP;
+-	 break;
+-      case RADEON_EMIT_RE_MISC:
+-	 ctx->re_misc = buf[i++];
+-	 sarea->dirty |= RADEON_UPLOAD_MISC;
+-	 break;
+-      case RADEON_EMIT_PP_TXFILTER_0:
+-	 tex0->pp_txfilter = buf[i++];
+-	 tex0->pp_txformat = buf[i++];
+-	 tex0->pp_txoffset = buf[i++];
+-	 tex0->pp_txcblend = buf[i++];
+-	 tex0->pp_txablend = buf[i++];
+-	 tex0->pp_tfactor = buf[i++];
+-	 sarea->dirty |= RADEON_UPLOAD_TEX0;
+-	 break;
+-      case RADEON_EMIT_PP_BORDER_COLOR_0:
+-	 tex0->pp_border_color = buf[i++];
+-	 sarea->dirty |= RADEON_UPLOAD_TEX0;
+-	 break;
+-      case RADEON_EMIT_PP_TXFILTER_1:
+-	 tex1->pp_txfilter = buf[i++];
+-	 tex1->pp_txformat = buf[i++];
+-	 tex1->pp_txoffset = buf[i++];
+-	 tex1->pp_txcblend = buf[i++];
+-	 tex1->pp_txablend = buf[i++];
+-	 tex1->pp_tfactor = buf[i++];
+-	 sarea->dirty |= RADEON_UPLOAD_TEX1;
+-	 break;
+-      case RADEON_EMIT_PP_BORDER_COLOR_1:
+-	 tex1->pp_border_color = buf[i++];
+-	 sarea->dirty |= RADEON_UPLOAD_TEX1;
+-	 break;
+-
+-      case RADEON_EMIT_SE_ZBIAS_FACTOR:
+-	 i++;
+-	 i++;
+-	 break;
+-
+-      case RADEON_EMIT_PP_TXFILTER_2:
+-      case RADEON_EMIT_PP_BORDER_COLOR_2:
+-      case RADEON_EMIT_SE_TCL_OUTPUT_VTX_FMT:
+-      case RADEON_EMIT_SE_TCL_MATERIAL_EMMISSIVE_RED:
+-      default:
+-	 /* These states aren't understood by radeon drm 1.1 */
+-	 fprintf(stderr, "Tried to emit unsupported state\n");
+-	 return;
+-      }
+-   }
+-}
+-
+-
+-
+-static void radeonCompatEmitStateLocked( radeonContextPtr rmesa )
+-{
+-   struct radeon_state_atom *atom;
+-
+-   if (RADEON_DEBUG & (DEBUG_STATE|DEBUG_PRIMS))
+-      fprintf(stderr, "%s\n", __FUNCTION__);
+-
+-   if (!rmesa->hw.is_dirty && !rmesa->hw.all_dirty)
+-      return;
+-
+-   foreach(atom, &rmesa->hw.atomlist) {
+-      if (rmesa->hw.all_dirty)
+-	 atom->dirty = GL_TRUE;
+-      if (atom->is_tcl)
+-	 atom->dirty = GL_FALSE;
+-      if (atom->dirty)
+-	 radeonCompatEmitPacket(rmesa, atom);
+-   }
+- 
+-   rmesa->hw.is_dirty = GL_FALSE;
+-   rmesa->hw.all_dirty = GL_FALSE;
+-}
+-
+-
+-static void radeonCompatEmitPrimitiveLocked( radeonContextPtr rmesa,
+-					     GLuint hw_primitive,
+-					     GLuint nverts,
+-					     drm_clip_rect_t *pbox,
+-					     GLuint nbox )
+-{
+-   int i;
+-
+-   for ( i = 0 ; i < nbox ; ) {
+-      int nr = MIN2( i + RADEON_NR_SAREA_CLIPRECTS, nbox );
+-      drm_clip_rect_t *b = rmesa->sarea->boxes;
+-      drm_radeon_vertex_t vtx;
+-      
+-      rmesa->sarea->dirty |= RADEON_UPLOAD_CLIPRECTS;
+-      rmesa->sarea->nbox = nr - i;
+-
+-      for ( ; i < nr ; i++) 
+-	 *b++ = pbox[i];
+-      
+-      if (RADEON_DEBUG & DEBUG_IOCTL)
+-	 fprintf(stderr, 
+-		 "RadeonFlushVertexBuffer: prim %x buf %d verts %d "
+-		 "disc %d nbox %d\n",
+-		 hw_primitive, 
+-		 rmesa->dma.current.buf->buf->idx, 
+-		 nverts, 
+-		 nr == nbox,
+-		 rmesa->sarea->nbox );
+-
+-      vtx.prim = hw_primitive;
+-      vtx.idx = rmesa->dma.current.buf->buf->idx;
+-      vtx.count = nverts;
+-      vtx.discard = (nr == nbox);      
+-
+-      drmCommandWrite( rmesa->dri.fd, 
+-		       DRM_RADEON_VERTEX,
+-		       &vtx, sizeof(vtx));
+-   }
+-}
+-
+-
+-
+-/* No 'start' for 1.1 vertices ioctl: only one vertex prim/buffer!  
+- */
+-void radeonCompatEmitPrimitive( radeonContextPtr rmesa,
+-				GLuint vertex_format,
+-				GLuint hw_primitive,
+-				GLuint nrverts )
+-{
+-   if (RADEON_DEBUG & DEBUG_IOCTL)
+-      fprintf(stderr, "%s\n", __FUNCTION__);
+-
+-   LOCK_HARDWARE( rmesa );
+-
+-   radeonCompatEmitStateLocked( rmesa );
+-   rmesa->sarea->vc_format = vertex_format;
+-   
+-   if (rmesa->state.scissor.enabled) {
+-      radeonCompatEmitPrimitiveLocked( rmesa, 
+-				       hw_primitive,
+-				       nrverts,
+-				       rmesa->state.scissor.pClipRects,
+-				       rmesa->state.scissor.numClipRects );
+-   }
+-   else {
+-      radeonCompatEmitPrimitiveLocked( rmesa, 
+-				       hw_primitive,
+-				       nrverts,
+-				       rmesa->pClipRects,
+-				       rmesa->numClipRects );
+-   }
+-
+-
+-   UNLOCK_HARDWARE( rmesa );
+-}
+-
+diff --git a/src/mesa/drivers/dri/radeon/radeon_context.c b/src/mesa/drivers/dri/radeon/radeon_context.c
+index 1e992c0..e4202c7 100644
+--- a/src/mesa/drivers/dri/radeon/radeon_context.c
++++ b/src/mesa/drivers/dri/radeon/radeon_context.c
+@@ -53,6 +53,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ #include "drivers/common/driverfuncs.h"
+ 
++#include "radeon_common.h"
+ #include "radeon_context.h"
+ #include "radeon_ioctl.h"
+ #include "radeon_state.h"
+@@ -62,9 +63,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "radeon_tcl.h"
+ #include "radeon_maos.h"
+ 
+-#define need_GL_ARB_multisample
+-#define need_GL_ARB_texture_compression
+-#define need_GL_ARB_vertex_buffer_object
+ #define need_GL_EXT_blend_minmax
+ #define need_GL_EXT_fog_coord
+ #define need_GL_EXT_secondary_color
+@@ -75,55 +73,18 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "vblank.h"
+ #include "utils.h"
+ #include "xmlpool.h" /* for symbolic values of enum-type options */
+-#ifndef RADEON_DEBUG
+-int RADEON_DEBUG = (0);
+-#endif
+-
+-
+-/* Return various strings for glGetString().
+- */
+-static const GLubyte *radeonGetString( GLcontext *ctx, GLenum name )
+-{
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+-   static char buffer[128];
+-   unsigned   offset;
+-   GLuint agp_mode = (rmesa->radeonScreen->card_type==RADEON_CARD_PCI) ? 0 :
+-      rmesa->radeonScreen->AGPMode;
+-
+-   switch ( name ) {
+-   case GL_VENDOR:
+-      return (GLubyte *)"Tungsten Graphics, Inc.";
+-
+-   case GL_RENDERER:
+-      offset = driGetRendererString( buffer, "Radeon", DRIVER_DATE,
+-				     agp_mode );
+-
+-      sprintf( & buffer[ offset ], " %sTCL",
+-	       !(rmesa->TclFallback & RADEON_TCL_FALLBACK_TCL_DISABLE)
+-	       ? "" : "NO-" );
+-
+-      return (GLubyte *)buffer;
+-
+-   default:
+-      return NULL;
+-   }
+-}
+-
+ 
+ /* Extension strings exported by the R100 driver.
+  */
+ const struct dri_extension card_extensions[] =
+ {
+-    { "GL_ARB_multisample",                GL_ARB_multisample_functions },
+     { "GL_ARB_multitexture",               NULL },
+     { "GL_ARB_texture_border_clamp",       NULL },
+-    { "GL_ARB_texture_compression",        GL_ARB_texture_compression_functions },
+     { "GL_ARB_texture_env_add",            NULL },
+     { "GL_ARB_texture_env_combine",        NULL },
+     { "GL_ARB_texture_env_crossbar",       NULL },
+     { "GL_ARB_texture_env_dot3",           NULL },
+     { "GL_ARB_texture_mirrored_repeat",    NULL },
+-    { "GL_ARB_vertex_buffer_object",       GL_ARB_vertex_buffer_object_functions },
+     { "GL_EXT_blend_logic_op",             NULL },
+     { "GL_EXT_blend_subtract",             GL_EXT_blend_minmax_functions },
+     { "GL_EXT_fog_coord",                  GL_EXT_fog_coord_functions },
+@@ -166,15 +127,6 @@ static const struct tnl_pipeline_stage *radeon_pipeline[] = {
+    NULL,
+ };
+ 
+-
+-
+-/* Initialize the driver's misc functions.
+- */
+-static void radeonInitDriverFuncs( struct dd_function_table *functions )
+-{
+-    functions->GetString	= radeonGetString;
+-}
+-
+ static const struct dri_debug_control debug_control[] =
+ {
+     { "fall",  DEBUG_FALLBACKS },
+@@ -194,6 +146,51 @@ static const struct dri_debug_control debug_control[] =
+     { NULL,    0 }
+ };
+ 
++static void r100_get_lock(radeonContextPtr radeon)
++{
++   r100ContextPtr rmesa = (r100ContextPtr)radeon;
++   drm_radeon_sarea_t *sarea = radeon->sarea;
++   /* get_lock vtbl hook: mirror the SAREA tiling flag into the colour-pitch state, then claim context ownership */
++   RADEON_STATECHANGE(rmesa, ctx);
++   if (rmesa->radeon.sarea->tiling_enabled) {
++      rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |=
++	 RADEON_COLOR_TILE_ENABLE;
++   } else {
++      rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] &=
++	 ~RADEON_COLOR_TILE_ENABLE;
++   }
++   /* SAREA records a different owner: take ownership and, without kernel_mm, call radeon_bo_legacy_texture_age() */
++   if (sarea->ctx_owner != rmesa->radeon.dri.hwContext) {
++      sarea->ctx_owner = rmesa->radeon.dri.hwContext;
++      
++      if (!radeon->radeonScreen->kernel_mm)
++         radeon_bo_legacy_texture_age(radeon->radeonScreen->bom);
++   }
++}
++/* emit_cs_header vtbl hook: does nothing for r100. */
++static void r100_vtbl_emit_cs_header(struct radeon_cs *cs, radeonContextPtr rmesa)
++{
++}
++/* pre_emit_state vtbl hook: mark the zbs atom and the hardware state as dirty. */
++static void r100_vtbl_pre_emit_state(radeonContextPtr radeon)
++{
++   r100ContextPtr rmesa = (r100ContextPtr)radeon;
++   
++   /* r100 always needs to emit ZBS to avoid TCL lockups */
++   rmesa->hw.zbs.dirty = 1;
++   radeon->hw.is_dirty = 1;
++}
++
++/* Fill radeon->vtbl with the r100 implementations of the common-code hooks. */
++static void r100_init_vtbl(radeonContextPtr radeon)
++{
++   radeon->vtbl.get_lock = r100_get_lock;
++   radeon->vtbl.update_viewport_offset = radeonUpdateViewportOffset;
++   radeon->vtbl.update_draw_buffer = radeonUpdateDrawBuffer;
++   radeon->vtbl.emit_cs_header = r100_vtbl_emit_cs_header;
++   radeon->vtbl.swtcl_flush = r100_swtcl_flush;
++   radeon->vtbl.pre_emit_state = r100_vtbl_pre_emit_state;
++}
+ 
+ /* Create the device specific context.
+  */
+@@ -205,8 +202,8 @@ radeonCreateContext( const __GLcontextModes *glVisual,
+    __DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
+    radeonScreenPtr screen = (radeonScreenPtr)(sPriv->private);
+    struct dd_function_table functions;
+-   radeonContextPtr rmesa;
+-   GLcontext *ctx, *shareCtx;
++   r100ContextPtr rmesa;
++   GLcontext *ctx;
+    int i;
+    int tcl_mode, fthrottle_mode;
+ 
+@@ -215,10 +212,12 @@ radeonCreateContext( const __GLcontextModes *glVisual,
+    assert(screen);
+ 
+    /* Allocate the Radeon context */
+-   rmesa = (radeonContextPtr) CALLOC( sizeof(*rmesa) );
++   rmesa = (r100ContextPtr) CALLOC( sizeof(*rmesa) );
+    if ( !rmesa )
+       return GL_FALSE;
+ 
++   r100_init_vtbl(&rmesa->radeon);
++
+    /* init exp fog table data */
+    radeonInitStaticFogData();
+    
+@@ -226,12 +225,12 @@ radeonCreateContext( const __GLcontextModes *glVisual,
+     * Do this here so that initialMaxAnisotropy is set before we create
+     * the default textures.
+     */
+-   driParseConfigFiles (&rmesa->optionCache, &screen->optionCache,
++   driParseConfigFiles (&rmesa->radeon.optionCache, &screen->optionCache,
+ 			screen->driScreen->myNum, "radeon");
+-   rmesa->initialMaxAnisotropy = driQueryOptionf(&rmesa->optionCache,
++   rmesa->radeon.initialMaxAnisotropy = driQueryOptionf(&rmesa->radeon.optionCache,
+                                                  "def_max_anisotropy");
+ 
+-   if ( driQueryOptionb( &rmesa->optionCache, "hyperz" ) ) {
++   if ( driQueryOptionb( &rmesa->radeon.optionCache, "hyperz" ) ) {
+       if ( sPriv->drm_version.minor < 13 )
+ 	 fprintf( stderr, "DRM version 1.%d too old to support HyperZ, "
+ 			  "disabling.\n", sPriv->drm_version.minor );
+@@ -246,65 +245,23 @@ radeonCreateContext( const __GLcontextModes *glVisual,
+     * (the texture functions are especially important)
+     */
+    _mesa_init_driver_functions( &functions );
+-   radeonInitDriverFuncs( &functions );
+    radeonInitTextureFuncs( &functions );
+ 
+-   /* Allocate the Mesa context */
+-   if (sharedContextPrivate)
+-      shareCtx = ((radeonContextPtr) sharedContextPrivate)->glCtx;
+-   else
+-      shareCtx = NULL;
+-   rmesa->glCtx = _mesa_create_context(glVisual, shareCtx,
+-                                       &functions, (void *) rmesa);
+-   if (!rmesa->glCtx) {
+-      FREE(rmesa);
+-      return GL_FALSE;
+-   }
+-   driContextPriv->driverPrivate = rmesa;
+-
+-   /* Init radeon context data */
+-   rmesa->dri.context = driContextPriv;
+-   rmesa->dri.screen = sPriv;
+-   rmesa->dri.drawable = NULL;
+-   rmesa->dri.readable = NULL;
+-   rmesa->dri.hwContext = driContextPriv->hHWContext;
+-   rmesa->dri.hwLock = &sPriv->pSAREA->lock;
+-   rmesa->dri.fd = sPriv->fd;
+-   rmesa->dri.drmMinor = sPriv->drm_version.minor;
+-
+-   rmesa->radeonScreen = screen;
+-   rmesa->sarea = (drm_radeon_sarea_t *)((GLubyte *)sPriv->pSAREA +
+-				       screen->sarea_priv_offset);
+-
+-
+-   rmesa->dma.buf0_address = rmesa->radeonScreen->buffers->list[0].address;
+-
+-   (void) memset( rmesa->texture_heaps, 0, sizeof( rmesa->texture_heaps ) );
+-   make_empty_list( & rmesa->swapped );
+-
+-   rmesa->nr_heaps = screen->numTexHeaps;
+-   for ( i = 0 ; i < rmesa->nr_heaps ; i++ ) {
+-      rmesa->texture_heaps[i] = driCreateTextureHeap( i, rmesa,
+-	    screen->texSize[i],
+-	    12,
+-	    RADEON_NR_TEX_REGIONS,
+-	    (drmTextureRegionPtr)rmesa->sarea->tex_list[i],
+-	    & rmesa->sarea->tex_age[i],
+-	    & rmesa->swapped,
+-	    sizeof( radeonTexObj ),
+-	    (destroy_texture_object_t *) radeonDestroyTexObj );
+-
+-      driSetTextureSwapCounterLocation( rmesa->texture_heaps[i],
+-					& rmesa->c_textureSwaps );
++   if (!radeonInitContext(&rmesa->radeon, &functions,
++			  glVisual, driContextPriv,
++			  sharedContextPrivate)) {
++     FREE(rmesa);
++     return GL_FALSE;
+    }
+-   rmesa->texture_depth = driQueryOptioni (&rmesa->optionCache,
++
++   rmesa->radeon.texture_depth = driQueryOptioni (&rmesa->radeon.optionCache,
+ 					   "texture_depth");
+-   if (rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_FB)
+-      rmesa->texture_depth = ( screen->cpp == 4 ) ?
++   if (rmesa->radeon.texture_depth == DRI_CONF_TEXTURE_DEPTH_FB)
++      rmesa->radeon.texture_depth = ( screen->cpp == 4 ) ?
+ 	 DRI_CONF_TEXTURE_DEPTH_32 : DRI_CONF_TEXTURE_DEPTH_16;
+ 
+-   rmesa->swtcl.RenderIndex = ~0;
+-   rmesa->hw.all_dirty = GL_TRUE;
++   rmesa->radeon.swtcl.RenderIndex = ~0;
++   rmesa->radeon.hw.all_dirty = GL_TRUE;
+ 
+    /* Set the maximum texture size small enough that we can guarentee that
+     * all texture units can bind a maximal texture and have all of them in
+@@ -312,26 +269,13 @@ radeonCreateContext( const __GLcontextModes *glVisual,
+     * setting allow larger textures.
+     */
+ 
+-   ctx = rmesa->glCtx;
+-   ctx->Const.MaxTextureUnits = driQueryOptioni (&rmesa->optionCache,
++   ctx = rmesa->radeon.glCtx;
++   ctx->Const.MaxTextureUnits = driQueryOptioni (&rmesa->radeon.optionCache,
+ 						 "texture_units");
+    ctx->Const.MaxTextureImageUnits = ctx->Const.MaxTextureUnits;
+    ctx->Const.MaxTextureCoordUnits = ctx->Const.MaxTextureUnits;
+ 
+-   i = driQueryOptioni( &rmesa->optionCache, "allow_large_textures");
+-
+-   driCalculateMaxTextureLevels( rmesa->texture_heaps,
+-				 rmesa->nr_heaps,
+-				 & ctx->Const,
+-				 4,
+-				 11, /* max 2D texture size is 2048x2048 */
+-				 8,  /* 256^3 */
+-				 9,  /* \todo: max cube texture size seems to be 512x512(x6) */
+-				 11, /* max rect texture size is 2048x2048. */
+-				 12,
+-				 GL_FALSE,
+-				 i );
+-
++   i = driQueryOptioni( &rmesa->radeon.optionCache, "allow_large_textures");
+ 
+    ctx->Const.MaxTextureMaxAnisotropy = 16.0;
+ 
+@@ -359,6 +303,8 @@ radeonCreateContext( const __GLcontextModes *glVisual,
+ 
+    rmesa->boxes = 0;
+ 
++   ctx->Const.MaxDrawBuffers = 1;
++
+    /* Initialize the software rasterizer and helper modules.
+     */
+    _swrast_CreateContext( ctx );
+@@ -392,38 +338,38 @@ radeonCreateContext( const __GLcontextModes *glVisual,
+    }
+ 
+    driInitExtensions( ctx, card_extensions, GL_TRUE );
+-   if (rmesa->radeonScreen->drmSupportsCubeMapsR100)
++   if (rmesa->radeon.radeonScreen->drmSupportsCubeMapsR100)
+       _mesa_enable_extension( ctx, "GL_ARB_texture_cube_map" );
+-   if (rmesa->glCtx->Mesa_DXTn) {
++   if (rmesa->radeon.glCtx->Mesa_DXTn) {
+       _mesa_enable_extension( ctx, "GL_EXT_texture_compression_s3tc" );
+       _mesa_enable_extension( ctx, "GL_S3_s3tc" );
+    }
+-   else if (driQueryOptionb (&rmesa->optionCache, "force_s3tc_enable")) {
++   else if (driQueryOptionb (&rmesa->radeon.optionCache, "force_s3tc_enable")) {
+       _mesa_enable_extension( ctx, "GL_EXT_texture_compression_s3tc" );
+    }
+ 
+-   if (rmesa->dri.drmMinor >= 9)
++   if (rmesa->radeon.dri.drmMinor >= 9)
+       _mesa_enable_extension( ctx, "GL_NV_texture_rectangle");
+ 
+    /* XXX these should really go right after _mesa_init_driver_functions() */
++   radeonInitSpanFuncs( ctx );
+    radeonInitIoctlFuncs( ctx );
+    radeonInitStateFuncs( ctx );
+-   radeonInitSpanFuncs( ctx );
+    radeonInitState( rmesa );
+    radeonInitSwtcl( ctx );
+ 
+    _mesa_vector4f_alloc( &rmesa->tcl.ObjClean, 0, 
+ 			 ctx->Const.MaxArrayLockSize, 32 );
+ 
+-   fthrottle_mode = driQueryOptioni(&rmesa->optionCache, "fthrottle_mode");
+-   rmesa->iw.irq_seq = -1;
+-   rmesa->irqsEmitted = 0;
+-   rmesa->do_irqs = (rmesa->radeonScreen->irq != 0 &&
+-		     fthrottle_mode == DRI_CONF_FTHROTTLE_IRQS);
++   fthrottle_mode = driQueryOptioni(&rmesa->radeon.optionCache, "fthrottle_mode");
++   rmesa->radeon.iw.irq_seq = -1;
++   rmesa->radeon.irqsEmitted = 0;
++   rmesa->radeon.do_irqs = (rmesa->radeon.radeonScreen->irq != 0 &&
++			    fthrottle_mode == DRI_CONF_FTHROTTLE_IRQS);
+ 
+-   rmesa->do_usleeps = (fthrottle_mode == DRI_CONF_FTHROTTLE_USLEEPS);
++   rmesa->radeon.do_usleeps = (fthrottle_mode == DRI_CONF_FTHROTTLE_USLEEPS);
+ 
+-   (*sPriv->systemTime->getUST)( & rmesa->swap_ust );
++   (*sPriv->systemTime->getUST)( & rmesa->radeon.swap_ust );
+ 
+ 
+ #if DO_DEBUG
+@@ -431,20 +377,20 @@ radeonCreateContext( const __GLcontextModes *glVisual,
+ 				       debug_control );
+ #endif
+ 
+-   tcl_mode = driQueryOptioni(&rmesa->optionCache, "tcl_mode");
+-   if (driQueryOptionb(&rmesa->optionCache, "no_rast")) {
++   tcl_mode = driQueryOptioni(&rmesa->radeon.optionCache, "tcl_mode");
++   if (driQueryOptionb(&rmesa->radeon.optionCache, "no_rast")) {
+       fprintf(stderr, "disabling 3D acceleration\n");
+       FALLBACK(rmesa, RADEON_FALLBACK_DISABLE, 1);
+    } else if (tcl_mode == DRI_CONF_TCL_SW ||
+-	      !(rmesa->radeonScreen->chip_flags & RADEON_CHIPSET_TCL)) {
+-      if (rmesa->radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
+-	 rmesa->radeonScreen->chip_flags &= ~RADEON_CHIPSET_TCL;
++	      !(rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL)) {
++      if (rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
++	 rmesa->radeon.radeonScreen->chip_flags &= ~RADEON_CHIPSET_TCL;
+ 	 fprintf(stderr, "Disabling HW TCL support\n");
+       }
+-      TCL_FALLBACK(rmesa->glCtx, RADEON_TCL_FALLBACK_TCL_DISABLE, 1);
++      TCL_FALLBACK(rmesa->radeon.glCtx, RADEON_TCL_FALLBACK_TCL_DISABLE, 1);
+    }
+ 
+-   if (rmesa->radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
++   if (rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
+ /*       _tnl_need_dlist_norm_lengths( ctx, GL_FALSE ); */
+    }
+    return GL_TRUE;
+@@ -458,179 +404,41 @@ radeonCreateContext( const __GLcontextModes *glVisual,
+ void radeonDestroyContext( __DRIcontextPrivate *driContextPriv )
+ {
+    GET_CURRENT_CONTEXT(ctx);
+-   radeonContextPtr rmesa = (radeonContextPtr) driContextPriv->driverPrivate;
+-   radeonContextPtr current = ctx ? RADEON_CONTEXT(ctx) : NULL;
++   r100ContextPtr rmesa = (r100ContextPtr) driContextPriv->driverPrivate;
++   r100ContextPtr current = ctx ? R100_CONTEXT(ctx) : NULL;
+ 
+    /* check if we're deleting the currently bound context */
+    if (rmesa == current) {
+-      RADEON_FIREVERTICES( rmesa );
++      radeon_firevertices(&rmesa->radeon);
+       _mesa_make_current(NULL, NULL, NULL);
+    }
+ 
+    /* Free radeon context resources */
+    assert(rmesa); /* should never be null */
+    if ( rmesa ) {
+-      GLboolean   release_texture_heaps;
+-
+ 
+-      release_texture_heaps = (rmesa->glCtx->Shared->RefCount == 1);
+-      _swsetup_DestroyContext( rmesa->glCtx );
+-      _tnl_DestroyContext( rmesa->glCtx );
+-      _vbo_DestroyContext( rmesa->glCtx );
+-      _swrast_DestroyContext( rmesa->glCtx );
++      _swsetup_DestroyContext( rmesa->radeon.glCtx );
++      _tnl_DestroyContext( rmesa->radeon.glCtx );
++      _vbo_DestroyContext( rmesa->radeon.glCtx );
++      _swrast_DestroyContext( rmesa->radeon.glCtx );
+ 
+-      radeonDestroySwtcl( rmesa->glCtx );
+-      radeonReleaseArrays( rmesa->glCtx, ~0 );
+-      if (rmesa->dma.current.buf) {
+-	 radeonReleaseDmaRegion( rmesa, &rmesa->dma.current, __FUNCTION__ );
+-	 radeonFlushCmdBuf( rmesa, __FUNCTION__ );
++      radeonDestroySwtcl( rmesa->radeon.glCtx );
++      radeonReleaseArrays( rmesa->radeon.glCtx, ~0 );
++      if (rmesa->radeon.dma.current) {
++	 radeonReleaseDmaRegion( &rmesa->radeon );
++	 rcommonFlushCmdBuf( &rmesa->radeon, __FUNCTION__ );
+       }
+ 
+       _mesa_vector4f_free( &rmesa->tcl.ObjClean );
+ 
+-      if (rmesa->state.scissor.pClipRects) {
+-	 FREE(rmesa->state.scissor.pClipRects);
+-	 rmesa->state.scissor.pClipRects = NULL;
+-      }
+-
+-      if ( release_texture_heaps ) {
+-         /* This share group is about to go away, free our private
+-          * texture object data.
+-          */
+-         int i;
+-
+-         for ( i = 0 ; i < rmesa->nr_heaps ; i++ ) {
+-	    driDestroyTextureHeap( rmesa->texture_heaps[ i ] );
+-	    rmesa->texture_heaps[ i ] = NULL;
+-         }
+-
+-	 assert( is_empty_list( & rmesa->swapped ) );
++      if (rmesa->radeon.state.scissor.pClipRects) {
++	 FREE(rmesa->radeon.state.scissor.pClipRects);
++	 rmesa->radeon.state.scissor.pClipRects = NULL;
+       }
+ 
+-      /* free the Mesa context */
+-      rmesa->glCtx->DriverCtx = NULL;
+-      _mesa_destroy_context( rmesa->glCtx );
+-
+-      /* free the option cache */
+-      driDestroyOptionCache (&rmesa->optionCache);
++      radeonCleanupContext(&rmesa->radeon);
+ 
+       FREE( rmesa );
+    }
+ }
+ 
+-
+-
+-
+-void
+-radeonSwapBuffers( __DRIdrawablePrivate *dPriv )
+-{
+-
+-   if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
+-      radeonContextPtr rmesa;
+-      GLcontext *ctx;
+-      rmesa = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
+-      ctx = rmesa->glCtx;
+-      if (ctx->Visual.doubleBufferMode) {
+-         _mesa_notifySwapBuffers( ctx );  /* flush pending rendering comands */
+-
+-         if ( rmesa->doPageFlip ) {
+-            radeonPageFlip( dPriv );
+-         }
+-         else {
+-	     radeonCopyBuffer( dPriv, NULL );
+-         }
+-      }
+-   }
+-   else {
+-      /* XXX this shouldn't be an error but we can't handle it for now */
+-      _mesa_problem(NULL, "%s: drawable has no context!", __FUNCTION__);
+-   }
+-}
+-
+-void radeonCopySubBuffer(__DRIdrawablePrivate * dPriv,
+-			 int x, int y, int w, int h )
+-{
+-    if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
+-	radeonContextPtr radeon;
+-	GLcontext *ctx;
+-
+-	radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
+-	ctx = radeon->glCtx;
+-
+-	if (ctx->Visual.doubleBufferMode) {
+-	    drm_clip_rect_t rect;
+-	    rect.x1 = x + dPriv->x;
+-	    rect.y1 = (dPriv->h - y - h) + dPriv->y;
+-	    rect.x2 = rect.x1 + w;
+-	    rect.y2 = rect.y1 + h;
+-	    _mesa_notifySwapBuffers(ctx);	/* flush pending rendering comands */
+-	    radeonCopyBuffer(dPriv, &rect);
+-	}
+-    } else {
+-	/* XXX this shouldn't be an error but we can't handle it for now */
+-	_mesa_problem(NULL, "%s: drawable has no context!",
+-		      __FUNCTION__);
+-    }
+-}
+-
+-/* Make context `c' the current context and bind it to the given
+- * drawing and reading surfaces.
+- */
+-GLboolean
+-radeonMakeCurrent( __DRIcontextPrivate *driContextPriv,
+-                   __DRIdrawablePrivate *driDrawPriv,
+-                   __DRIdrawablePrivate *driReadPriv )
+-{
+-   if ( driContextPriv ) {
+-      radeonContextPtr newCtx = 
+-	 (radeonContextPtr) driContextPriv->driverPrivate;
+-
+-      if (RADEON_DEBUG & DEBUG_DRI)
+-	 fprintf(stderr, "%s ctx %p\n", __FUNCTION__, (void *) newCtx->glCtx);
+-
+-      newCtx->dri.readable = driReadPriv;
+-
+-      if ( (newCtx->dri.drawable != driDrawPriv) ||
+-           newCtx->lastStamp != driDrawPriv->lastStamp ) {
+-	 if (driDrawPriv->swap_interval == (unsigned)-1) {
+-	    driDrawPriv->vblFlags = (newCtx->radeonScreen->irq != 0)
+-	       ? driGetDefaultVBlankFlags(&newCtx->optionCache)
+-	       : VBLANK_FLAG_NO_IRQ;
+-
+-	    driDrawableInitVBlank( driDrawPriv );
+-	 }
+-
+-	 newCtx->dri.drawable = driDrawPriv;
+-
+-	 radeonSetCliprects(newCtx);
+-	 radeonUpdateViewportOffset( newCtx->glCtx );
+-      }
+-
+-      _mesa_make_current( newCtx->glCtx,
+-			  (GLframebuffer *) driDrawPriv->driverPrivate,
+-			  (GLframebuffer *) driReadPriv->driverPrivate );
+-
+-      _mesa_update_state( newCtx->glCtx );
+-   } else {
+-      if (RADEON_DEBUG & DEBUG_DRI)
+-	 fprintf(stderr, "%s ctx is null\n", __FUNCTION__);
+-      _mesa_make_current( NULL, NULL, NULL );
+-   }
+-
+-   if (RADEON_DEBUG & DEBUG_DRI)
+-      fprintf(stderr, "End %s\n", __FUNCTION__);
+-   return GL_TRUE;
+-}
+-
+-/* Force the context `c' to be unbound from its buffer.
+- */
+-GLboolean
+-radeonUnbindContext( __DRIcontextPrivate *driContextPriv )
+-{
+-   radeonContextPtr rmesa = (radeonContextPtr) driContextPriv->driverPrivate;
+-
+-   if (RADEON_DEBUG & DEBUG_DRI)
+-      fprintf(stderr, "%s ctx %p\n", __FUNCTION__, (void *) rmesa->glCtx);
+-
+-   return GL_TRUE;
+-}
+diff --git a/src/mesa/drivers/dri/radeon/radeon_context.h b/src/mesa/drivers/dri/radeon/radeon_context.h
+index 53df766..2efabd1 100644
+--- a/src/mesa/drivers/dri/radeon/radeon_context.h
++++ b/src/mesa/drivers/dri/radeon/radeon_context.h
+@@ -48,91 +48,23 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "drm.h"
+ #include "radeon_drm.h"
+ #include "texmem.h"
+-
+ #include "main/macros.h"
+ #include "main/mtypes.h"
+ #include "main/colormac.h"
+-
+-struct radeon_context;
+-typedef struct radeon_context radeonContextRec;
+-typedef struct radeon_context *radeonContextPtr;
+-
+-/* This union is used to avoid warnings/miscompilation
+-   with float to uint32_t casts due to strict-aliasing */
+-typedef union {
+-	GLfloat f;
+-	uint32_t ui32;
+-} float_ui32_type;
+-
+-#include "radeon_lock.h"
+ #include "radeon_screen.h"
+-#include "main/mm.h"
+-
+-#include "math/m_vector.h"
+-
+-#define TEX_0   0x1
+-#define TEX_1   0x2
+-#define TEX_2   0x4
+-#define TEX_ALL 0x7
+-
+-/* Rasterizing fallbacks */
+-/* See correponding strings in r200_swtcl.c */
+-#define RADEON_FALLBACK_TEXTURE		0x0001
+-#define RADEON_FALLBACK_DRAW_BUFFER	0x0002
+-#define RADEON_FALLBACK_STENCIL		0x0004
+-#define RADEON_FALLBACK_RENDER_MODE	0x0008
+-#define RADEON_FALLBACK_BLEND_EQ	0x0010
+-#define RADEON_FALLBACK_BLEND_FUNC	0x0020
+-#define RADEON_FALLBACK_DISABLE 	0x0040
+-#define RADEON_FALLBACK_BORDER_MODE	0x0080
+-
+-/* The blit width for texture uploads
+- */
+-#define BLIT_WIDTH_BYTES 1024
+ 
+-/* Use the templated vertex format:
+- */
+-#define COLOR_IS_RGBA
+-#define TAG(x) radeon##x
+-#include "tnl_dd/t_dd_vertex.h"
+-#undef TAG
+-
+-typedef void (*radeon_tri_func) (radeonContextPtr,
+-				 radeonVertex *,
+-				 radeonVertex *, radeonVertex *);
+-
+-typedef void (*radeon_line_func) (radeonContextPtr,
+-				  radeonVertex *, radeonVertex *);
++#include "radeon_common.h"
+ 
+-typedef void (*radeon_point_func) (radeonContextPtr, radeonVertex *);
+-
+-struct radeon_colorbuffer_state {
+-	GLuint clear;
+-	int roundEnable;
+-};
+ 
+-struct radeon_depthbuffer_state {
+-	GLuint clear;
+-	GLfloat scale;
+-};
++struct r100_context;
++typedef struct r100_context r100ContextRec;
++typedef struct r100_context *r100ContextPtr;
+ 
+-struct radeon_scissor_state {
+-	drm_clip_rect_t rect;
+-	GLboolean enabled;
++#include "radeon_lock.h"
+ 
+-	GLuint numClipRects;	/* Cliprects active */
+-	GLuint numAllocedClipRects;	/* Cliprects available */
+-	drm_clip_rect_t *pClipRects;
+-};
+ 
+-struct radeon_stencilbuffer_state {
+-	GLboolean hwBuffer;
+-	GLuint clear;		/* rb3d_stencilrefmask value */
+-};
+ 
+-struct radeon_stipple_state {
+-	GLuint mask[32];
+-};
++#define R100_TEX_ALL 0x7
+ 
+ /* used for both tcl_vtx and vc_frmt tex bits (they are identical) */
+ #define RADEON_ST_BIT(unit) \
+@@ -141,42 +73,6 @@ struct radeon_stipple_state {
+ #define RADEON_Q_BIT(unit) \
+ (unit == 0 ? RADEON_CP_VC_FRMT_Q0 : (RADEON_CP_VC_FRMT_Q1 >> 2) << (2 * unit))
+ 
+-typedef struct radeon_tex_obj radeonTexObj, *radeonTexObjPtr;
+-
+-/* Texture object in locally shared texture space.
+- */
+-struct radeon_tex_obj {
+-	driTextureObject base;
+-
+-	GLuint bufAddr;		/* Offset to start of locally
+-				   shared texture block */
+-
+-	GLuint dirty_state;	/* Flags (1 per texunit) for
+-				   whether or not this texobj
+-				   has dirty hardware state
+-				   (pp_*) that needs to be
+-				   brought into the
+-				   texunit. */
+-
+-	drm_radeon_tex_image_t image[6][RADEON_MAX_TEXTURE_LEVELS];
+-	/* Six, for the cube faces */
+-
+-	GLboolean image_override; /* Image overridden by GLX_EXT_tfp */
+-
+-	GLuint pp_txfilter;	/* hardware register values */
+-	GLuint pp_txformat;
+-	GLuint pp_txoffset;	/* Image location in texmem.
+-				   All cube faces follow. */
+-	GLuint pp_txsize;	/* npot only */
+-	GLuint pp_txpitch;	/* npot only */
+-	GLuint pp_border_color;
+-	GLuint pp_cubic_faces;	/* cube face 1,2,3,4 log2 sizes */
+-
+-	GLboolean border_fallback;
+-
+-	GLuint tile_bits;	/* hw texture tile bits used on this texture */
+-};
+-
+ struct radeon_texture_env_state {
+ 	radeonTexObjPtr texobj;
+ 	GLenum format;
+@@ -187,17 +83,6 @@ struct radeon_texture_state {
+ 	struct radeon_texture_env_state unit[RADEON_MAX_TEXTURE_UNITS];
+ };
+ 
+-struct radeon_state_atom {
+-	struct radeon_state_atom *next, *prev;
+-	const char *name;	/* for debug */
+-	int cmd_size;		/* size in bytes */
+-	GLuint is_tcl;
+-	int *cmd;		/* one or more cmd's */
+-	int *lastcmd;		/* one or more cmd's */
+-	GLboolean dirty;	/* dirty-mark in emit_state_list */
+-	 GLboolean(*check) (GLcontext *);	/* is this state active? */
+-};
+-
+ /* Trying to keep these relatively short as the variables are becoming
+  * extravagently long.  Drop the driver name prefix off the front of
+  * everything - I think we know which driver we're in by now, and keep the
+@@ -410,10 +295,7 @@ struct radeon_state_atom {
+ #define SHN_SHININESS      1
+ #define SHN_STATE_SIZE     2
+ 
+-struct radeon_hw_state {
+-	/* Head of the linked list of state atoms. */
+-	struct radeon_state_atom atomlist;
+-
++struct r100_hw_state {
+ 	/* Hardware state, stored as cmdbuf commands:  
+ 	 *   -- Need to doublebuffer for
+ 	 *           - eliding noop statechange loops? (except line stipple count)
+@@ -438,86 +320,16 @@ struct radeon_hw_state {
+ 	struct radeon_state_atom glt;
+ 	struct radeon_state_atom txr[3];	/* for NPOT */
+ 
+-	int max_state_size;	/* Number of bytes necessary for a full state emit. */
+-	GLboolean is_dirty, all_dirty;
+ };
+ 
+-struct radeon_state {
+-	/* Derived state for internal purposes:
+-	 */
+-	struct radeon_colorbuffer_state color;
+-	struct radeon_depthbuffer_state depth;
+-	struct radeon_scissor_state scissor;
+-	struct radeon_stencilbuffer_state stencil;
++
++struct r100_state {
+ 	struct radeon_stipple_state stipple;
+ 	struct radeon_texture_state texture;
+ };
+ 
+-/* Need refcounting on dma buffers:
+- */
+-struct radeon_dma_buffer {
+-	int refcount;		/* the number of retained regions in buf */
+-	drmBufPtr buf;
+-};
+-
+-#define GET_START(rvb) (rmesa->radeonScreen->gart_buffer_offset +			\
+-			(rvb)->address - rmesa->dma.buf0_address +	\
+-			(rvb)->start)
+-
+-/* A retained region, eg vertices for indexed vertices.
+- */
+-struct radeon_dma_region {
+-	struct radeon_dma_buffer *buf;
+-	char *address;		/* == buf->address */
+-	int start, end, ptr;	/* offsets from start of buf */
+-	int aos_start;
+-	int aos_stride;
+-	int aos_size;
+-};
+-
+-struct radeon_dma {
+-	/* Active dma region.  Allocations for vertices and retained
+-	 * regions come from here.  Also used for emitting random vertices,
+-	 * these may be flushed by calling flush_current();
+-	 */
+-	struct radeon_dma_region current;
+-
+-	void (*flush) (radeonContextPtr);
+-
+-	char *buf0_address;	/* start of buf[0], for index calcs */
+-	GLuint nr_released_bufs;	/* flush after so many buffers released */
+-};
+-
+-struct radeon_dri_mirror {
+-	__DRIcontextPrivate *context;	/* DRI context */
+-	__DRIscreenPrivate *screen;	/* DRI screen */
+-
+-   /**
+-    * DRI drawable bound to this context for drawing.
+-    */
+-	__DRIdrawablePrivate *drawable;
+-
+-   /**
+-    * DRI drawable bound to this context for reading.
+-    */
+-	__DRIdrawablePrivate *readable;
+-
+-	drm_context_t hwContext;
+-	drm_hw_lock_t *hwLock;
+-	int fd;
+-	int drmMinor;
+-};
+-
+ #define RADEON_CMD_BUF_SZ  (8*1024)
+-
+-struct radeon_store {
+-	GLuint statenr;
+-	GLuint primnr;
+-	char cmd_buf[RADEON_CMD_BUF_SZ];
+-	int cmd_used;
+-	int elts_start;
+-};
+-
++#define R200_ELT_BUF_SZ  (8*1024)
+ /* radeon_tcl.c
+  */
+ struct radeon_tcl_info {
+@@ -529,30 +341,23 @@ struct radeon_tcl_info {
+ 	 */
+ 	GLvector4f ObjClean;
+ 
+-	struct radeon_dma_region *aos_components[8];
++        struct radeon_aos aos[8];
+ 	GLuint nr_aos_components;
+ 
+ 	GLuint *Elts;
+ 
+-	struct radeon_dma_region indexed_verts;
+-	struct radeon_dma_region obj;
+-	struct radeon_dma_region rgba;
+-	struct radeon_dma_region spec;
+-	struct radeon_dma_region fog;
+-	struct radeon_dma_region tex[RADEON_MAX_TEXTURE_UNITS];
+-	struct radeon_dma_region norm;
++	struct radeon_bo *indexed_bo;
++
++        int elt_cmd_offset; /** Offset into the cmdbuf */
++	int elt_cmd_start;
++        int elt_used;
+ };
+ 
+ /* radeon_swtcl.c
+  */
+-struct radeon_swtcl_info {
+-	GLuint RenderIndex;
+-	GLuint vertex_size;
++struct r100_swtcl_info {
+ 	GLuint vertex_format;
+ 
+-	struct tnl_attr_map vertex_attrs[VERT_ATTRIB_MAX];
+-	GLuint vertex_attr_count;
+-
+ 	GLubyte *verts;
+ 
+ 	/* Fallback rasterization functions
+@@ -561,10 +366,6 @@ struct radeon_swtcl_info {
+ 	radeon_line_func draw_line;
+ 	radeon_tri_func draw_tri;
+ 
+-	GLuint hw_primitive;
+-	GLenum render_primitive;
+-	GLuint numverts;
+-
+    /**
+     * Offset of the 4UB color data within a hardware (swtcl) vertex.
+     */
+@@ -576,22 +377,9 @@ struct radeon_swtcl_info {
+ 	GLuint specoffset;
+ 
+ 	GLboolean needproj;
+-
+-	struct radeon_dma_region indexed_verts;
+ };
+ 
+-struct radeon_ioctl {
+-	GLuint vertex_offset;
+-	GLuint vertex_size;
+-};
+ 
+-#define RADEON_MAX_PRIMS 64
+-
+-struct radeon_prim {
+-	GLuint start;
+-	GLuint end;
+-	GLuint prim;
+-};
+ 
+ /* A maximum total of 20 elements per vertex:  3 floats for position, 3
+  * floats for normal, 4 floats for color, 4 bytes for secondary color,
+@@ -602,59 +390,18 @@ struct radeon_prim {
+  */
+ #define RADEON_MAX_VERTEX_SIZE 20
+ 
+-struct radeon_context {
+-	GLcontext *glCtx;	/* Mesa context */
++struct r100_context {
++        struct radeon_context radeon;
+ 
+ 	/* Driver and hardware state management
+ 	 */
+-	struct radeon_hw_state hw;
+-	struct radeon_state state;
+-
+-	/* Texture object bookkeeping
+-	 */
+-	unsigned nr_heaps;
+-	driTexHeap *texture_heaps[RADEON_NR_TEX_HEAPS];
+-	driTextureObject swapped;
+-	int texture_depth;
+-	float initialMaxAnisotropy;
+-
+-	/* Rasterization and vertex state:
+-	 */
+-	GLuint TclFallback;
+-	GLuint Fallback;
+-	GLuint NewGLState;
+-	 DECLARE_RENDERINPUTS(tnl_index_bitset);	/* index of bits for last tnl_install_attrs */
++	struct r100_hw_state hw;
++	struct r100_state state;
+ 
+ 	/* Vertex buffers
+ 	 */
+ 	struct radeon_ioctl ioctl;
+-	struct radeon_dma dma;
+ 	struct radeon_store store;
+-	/* A full state emit as of the first state emit in the main store, in case
+-	 * the context is lost.
+-	 */
+-	struct radeon_store backup_store;
+-
+-	/* Page flipping
+-	 */
+-	GLuint doPageFlip;
+-
+-	/* Busy waiting
+-	 */
+-	GLuint do_usleeps;
+-	GLuint do_irqs;
+-	GLuint irqsEmitted;
+-	drm_radeon_irq_wait_t iw;
+-
+-	/* Drawable, cliprect and scissor information
+-	 */
+-	GLuint numClipRects;	/* Cliprects for the draw buffer */
+-	drm_clip_rect_t *pClipRects;
+-	unsigned int lastStamp;
+-	GLboolean lost_context;
+-	GLboolean save_on_next_emit;
+-	radeonScreenPtr radeonScreen;	/* Screen private DRI data */
+-	drm_radeon_sarea_t *sarea;	/* Private SAREA data */
+ 
+ 	/* TCL stuff
+ 	 */
+@@ -667,29 +414,13 @@ struct radeon_context {
+ 	GLmatrix tmpmat[RADEON_MAX_TEXTURE_UNITS];
+ 	GLuint last_ReallyEnabled;
+ 
+-	/* VBI
+-	 */
+-	int64_t swap_ust;
+-	int64_t swap_missed_ust;
+-
+-	GLuint swap_count;
+-	GLuint swap_missed_count;
+-
+ 	/* radeon_tcl.c
+ 	 */
+ 	struct radeon_tcl_info tcl;
+ 
+ 	/* radeon_swtcl.c
+ 	 */
+-	struct radeon_swtcl_info swtcl;
+-
+-	/* Mirrors of some DRI state
+-	 */
+-	struct radeon_dri_mirror dri;
+-
+-	/* Configuration cache
+-	 */
+-	driOptionCache optionCache;
++	struct r100_swtcl_info swtcl;
+ 
+ 	GLboolean using_hyperz;
+ 	GLboolean texmicrotile;
+@@ -703,23 +434,11 @@ struct radeon_context {
+ 	GLuint c_textureSwaps;
+ 	GLuint c_textureBytes;
+ 	GLuint c_vertexBuffers;
++
+ };
+ 
+-#define RADEON_CONTEXT(ctx)		((radeonContextPtr)(ctx->DriverCtx))
+-
+-static INLINE GLuint radeonPackColor(GLuint cpp,
+-                                     GLubyte r, GLubyte g,
+-                                     GLubyte b, GLubyte a)
+-{
+-	switch (cpp) {
+-	case 2:
+-		return PACK_COLOR_565(r, g, b);
+-	case 4:
+-		return PACK_COLOR_8888(a, r, g, b);
+-	default:
+-		return 0;
+-	}
+-}
++#define R100_CONTEXT(ctx)		((r100ContextPtr)(ctx->DriverCtx))
++
+ 
+ #define RADEON_OLD_PACKETS 1
+ 
+@@ -727,37 +446,11 @@ extern void radeonDestroyContext(__DRIcontextPrivate * driContextPriv);
+ extern GLboolean radeonCreateContext(const __GLcontextModes * glVisual,
+ 				     __DRIcontextPrivate * driContextPriv,
+ 				     void *sharedContextPrivate);
+-extern void radeonSwapBuffers(__DRIdrawablePrivate * dPriv);
+-extern void radeonCopySubBuffer(__DRIdrawablePrivate * dPriv,
+-				int x, int y, int w, int h);
+ extern GLboolean radeonMakeCurrent(__DRIcontextPrivate * driContextPriv,
+ 				   __DRIdrawablePrivate * driDrawPriv,
+ 				   __DRIdrawablePrivate * driReadPriv);
+ extern GLboolean radeonUnbindContext(__DRIcontextPrivate * driContextPriv);
+ 
+-/* ================================================================
+- * Debugging:
+- */
+-#define DO_DEBUG		1
+-
+-#if DO_DEBUG
+-extern int RADEON_DEBUG;
+-#else
+-#define RADEON_DEBUG		0
+-#endif
+-
+-#define DEBUG_TEXTURE	0x0001
+-#define DEBUG_STATE	0x0002
+-#define DEBUG_IOCTL	0x0004
+-#define DEBUG_PRIMS	0x0008
+-#define DEBUG_VERTS	0x0010
+-#define DEBUG_FALLBACKS	0x0020
+-#define DEBUG_VFMT	0x0040
+-#define DEBUG_CODEGEN	0x0080
+-#define DEBUG_VERBOSE	0x0100
+-#define DEBUG_DRI       0x0200
+-#define DEBUG_DMA       0x0400
+-#define DEBUG_SANITY    0x0800
+-#define DEBUG_SYNC      0x1000
++
+ 
+ #endif				/* __RADEON_CONTEXT_H__ */
+diff --git a/src/mesa/drivers/dri/radeon/radeon_cs_drm.h b/src/mesa/drivers/dri/radeon/radeon_cs_drm.h
+new file mode 100644
+index 0000000..984725a
+--- /dev/null
++++ b/src/mesa/drivers/dri/radeon/radeon_cs_drm.h
+@@ -0,0 +1,207 @@
++/* 
++ * Copyright © 2008 Nicolai Haehnle
++ * Copyright © 2008 Jérôme Glisse
++ * All Rights Reserved.
++ * 
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the
++ * "Software"), to deal in the Software without restriction, including
++ * without limitation the rights to use, copy, modify, merge, publish,
++ * distribute, sub license, and/or sell copies of the Software, and to
++ * permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ * 
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
++ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
++ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
++ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
++ * USE OR OTHER DEALINGS IN THE SOFTWARE.
++ *
++ * The above copyright notice and this permission notice (including the
++ * next paragraph) shall be included in all copies or substantial portions
++ * of the Software.
++ */
++/*
++ * Authors:
++ *      Aapo Tahkola <aet@rasterburn.org>
++ *      Nicolai Haehnle <prefect_@gmx.net>
++ *      Jérôme Glisse <glisse@freedesktop.org>
++ */
++#ifndef RADEON_CS_H
++#define RADEON_CS_H
++
++#include <stdint.h>
++#include <string.h>
++#include "drm.h"
++#include "radeon_drm.h"
++
++struct radeon_cs_reloc {
++    struct radeon_bo    *bo;
++    uint32_t            read_domain;
++    uint32_t            write_domain;
++    uint32_t            flags;
++};
++
++
++#define RADEON_CS_SPACE_OK 0
++#define RADEON_CS_SPACE_OP_TO_BIG 1
++#define RADEON_CS_SPACE_FLUSH 2
++
++struct radeon_cs_space_check {
++    struct radeon_bo *bo;
++    uint32_t read_domains;
++    uint32_t write_domain;
++    uint32_t new_accounted;
++};
++
++struct radeon_cs_manager;
++
++struct radeon_cs { /* one command stream under construction */
++    struct radeon_cs_manager    *csm;               /* manager whose funcs table the inline wrappers dispatch through */
++    void                        *relocs;            /* backend-private relocation array (legacy: struct cs_reloc_legacy[]) */
++    uint32_t                    *packets;           /* dword buffer holding the commands */
++    unsigned                    crelocs;            /* number of entries in relocs */
++    unsigned                    relocs_total_size;  /* running total maintained by the backend's write_reloc */
++    unsigned                    cdw;                /* next free dword index in packets */
++    unsigned                    ndw;                /* allocated size of packets, in dwords */
++    int                         section;            /* non-zero between cs_begin and cs_end */
++    unsigned                    section_ndw;        /* dwords the open section declared it would write */
++    unsigned                    section_cdw;        /* dwords actually written in the open section */
++    const char                  *section_file;      /* caller location of the open section, for diagnostics */
++    const char                  *section_func;
++    int                         section_line;
++
++};
++
++/* cs functions */
++struct radeon_cs_funcs {
++    struct radeon_cs *(*cs_create)(struct radeon_cs_manager *csm,
++                                   uint32_t ndw);
++    int (*cs_write_reloc)(struct radeon_cs *cs,
++                          struct radeon_bo *bo,
++                          uint32_t read_domain,
++                          uint32_t write_domain,
++                          uint32_t flags);
++    int (*cs_begin)(struct radeon_cs *cs,
++                    uint32_t ndw,
++                    const char *file,
++                    const char *func,
++                    int line);
++    int (*cs_end)(struct radeon_cs *cs,
++                  const char *file,
++                  const char *func,
++                  int line);
++    int (*cs_emit)(struct radeon_cs *cs);
++    int (*cs_destroy)(struct radeon_cs *cs);
++    int (*cs_erase)(struct radeon_cs *cs);
++    int (*cs_need_flush)(struct radeon_cs *cs);
++    void (*cs_print)(struct radeon_cs *cs, FILE *file);
++    int (*cs_space_check)(struct radeon_cs *cs, struct radeon_cs_space_check *bos,
++			  int num_bo);
++};
++
++struct radeon_cs_manager { /* state shared by every command stream of one backend */
++    struct radeon_cs_funcs  *funcs;                 /* backend vtable used by the inline radeon_cs_* wrappers */
++    int                     fd;                     /* descriptor the backend submits through (legacy passes it to drmCommandWrite) */
++    uint32_t vram_limit, gart_limit;                /* per-domain ceilings, set via radeon_cs_set_limit() */
++    uint32_t vram_write_used, gart_write_used;      /* bo sizes accounted as written so far; legacy cs_emit resets them */
++    uint32_t read_used;                             /* bo sizes accounted as read so far; legacy cs_emit resets it */
++};
++
++static inline struct radeon_cs *radeon_cs_create(struct radeon_cs_manager *csm,
++                                                 uint32_t ndw)
++{
++    return csm->funcs->cs_create(csm, ndw);
++}
++
++static inline int radeon_cs_write_reloc(struct radeon_cs *cs,
++                                        struct radeon_bo *bo,
++                                        uint32_t read_domain,
++                                        uint32_t write_domain,
++                                        uint32_t flags)
++{
++    return cs->csm->funcs->cs_write_reloc(cs,
++                                          bo,
++                                          read_domain,
++                                          write_domain,
++                                          flags);
++}
++
++static inline int radeon_cs_begin(struct radeon_cs *cs,
++                                  uint32_t ndw,
++                                  const char *file,
++                                  const char *func,
++                                  int line)
++{
++    return cs->csm->funcs->cs_begin(cs, ndw, file, func, line);
++}
++
++static inline int radeon_cs_end(struct radeon_cs *cs,
++                                const char *file,
++                                const char *func,
++                                int line)
++{
++    return cs->csm->funcs->cs_end(cs, file, func, line);
++}
++
++static inline int radeon_cs_emit(struct radeon_cs *cs)
++{
++    return cs->csm->funcs->cs_emit(cs);
++}
++
++static inline int radeon_cs_destroy(struct radeon_cs *cs)
++{
++    return cs->csm->funcs->cs_destroy(cs);
++}
++
++static inline int radeon_cs_erase(struct radeon_cs *cs)
++{
++    return cs->csm->funcs->cs_erase(cs);
++}
++
++static inline int radeon_cs_need_flush(struct radeon_cs *cs)
++{
++    return cs->csm->funcs->cs_need_flush(cs);
++}
++
++static inline void radeon_cs_print(struct radeon_cs *cs, FILE *file)
++{
++    cs->csm->funcs->cs_print(cs, file);
++}
++
++static inline int radeon_cs_space_check(struct radeon_cs *cs,
++					    struct radeon_cs_space_check *bos,
++					    int num_bo)
++{
++    return cs->csm->funcs->cs_space_check(cs, bos, num_bo);
++}
++
++static inline void radeon_cs_set_limit(struct radeon_cs *cs, uint32_t domain, uint32_t limit)
++{
++    /* Limits are stored on the manager; any domain other than VRAM sets the GART limit. */
++    if (domain == RADEON_GEM_DOMAIN_VRAM)
++	cs->csm->vram_limit = limit;
++    else
++	cs->csm->gart_limit = limit;
++}
++
++static inline void radeon_cs_write_dword(struct radeon_cs *cs, uint32_t dword)
++{
++    cs->packets[cs->cdw++] = dword; /* appends one dword; no check against cs->ndw is made here */
++    if (cs->section) { /* count it against the open section so cs_end can verify the size */
++        cs->section_cdw++;
++    }
++}
++
++static inline void radeon_cs_write_qword(struct radeon_cs *cs, uint64_t qword)
++{
++    /* Copies the 64-bit value into the next two dword slots; no check against cs->ndw is made here. */
++    memcpy(cs->packets + cs->cdw, &qword, sizeof(qword));
++    cs->cdw+=2;
++    if (cs->section) { /* a qword counts as two dwords of the open section */
++        cs->section_cdw+=2;
++    }
++}
++#endif
+diff --git a/src/mesa/drivers/dri/radeon/radeon_cs_legacy.c b/src/mesa/drivers/dri/radeon/radeon_cs_legacy.c
 new file mode 100644
-index 0000000..984725a
+index 0000000..b47b095
 --- /dev/null
-+++ b/src/mesa/drivers/dri/radeon/radeon_cs_drm.h
-@@ -0,0 +1,207 @@
++++ b/src/mesa/drivers/dri/radeon/radeon_cs_legacy.c
+@@ -0,0 +1,504 @@
 +/* 
 + * Copyright © 2008 Nicolai Haehnle
 + * Copyright © 2008 Jérôme Glisse
@@ -3398,188 +24482,485 @@ index 0000000..984725a
 + *      Nicolai Haehnle <prefect_@gmx.net>
 + *      Jérôme Glisse <glisse@freedesktop.org>
 + */
-+#ifndef RADEON_CS_H
-+#define RADEON_CS_H
++#include <errno.h>
 +
-+#include <stdint.h>
-+#include <string.h>
-+#include "drm.h"
-+#include "radeon_drm.h"
++#include "radeon_bocs_wrapper.h"
 +
-+struct radeon_cs_reloc {
-+    struct radeon_bo    *bo;
-+    uint32_t            read_domain;
-+    uint32_t            write_domain;
-+    uint32_t            flags;
++struct cs_manager_legacy {
++    struct radeon_cs_manager    base;           /* first member: this file casts between the two pointer types */
++    struct radeon_context       *ctx;           /* context supplied to radeon_cs_manager_legacy_ctor() */
++    /* hack for scratch stuff */
++    uint32_t                    pending_age;    /* age handed to radeon_bo_legacy_pending(); cs_emit passes its address to the kernel */
++    uint32_t                    pending_count;  /* set to 1 by cs_emit on R300-class hardware */
++
++
++};
++
++struct cs_reloc_legacy {
++    struct radeon_cs_reloc  base;
++    uint32_t                cindices;
++    uint32_t                *indices;
 +};
 +
 +
-+#define RADEON_CS_SPACE_OK 0
-+#define RADEON_CS_SPACE_OP_TO_BIG 1
-+#define RADEON_CS_SPACE_FLUSH 2
++static struct radeon_cs *cs_create(struct radeon_cs_manager *csm,
++                                   uint32_t ndw)
++{
++    struct radeon_cs *cs;
++    /* Allocate a zeroed command stream with room for at least ndw dwords; NULL on allocation failure. */
++    cs = (struct radeon_cs*)calloc(1, sizeof(struct radeon_cs));
++    if (cs == NULL) {
++        return NULL;
++    }
++    cs->csm = csm;
++    cs->ndw = (ndw + 0x3FF) & (~0x3FF); /* round capacity up to a multiple of 0x400 dwords */
++    cs->packets = (uint32_t*)malloc(4*cs->ndw); /* 4 bytes per dword */
++    if (cs->packets == NULL) {
++        free(cs);
++        return NULL;
++    }
++    cs->relocs_total_size = 0; /* already zero from calloc; kept explicit */
++    return cs;
++}
++
++static int cs_write_reloc(struct radeon_cs *cs,
++                          struct radeon_bo *bo,
++                          uint32_t read_domain,
++                          uint32_t write_domain,
++                          uint32_t flags)
++{
++    struct cs_reloc_legacy *relocs;
++    int i;
++    /* Record that packet slot cs->cdw - 1 refers to bo; returns 0, -EINVAL or -ENOMEM. */
++    relocs = (struct cs_reloc_legacy *)cs->relocs;
++    /* exactly one of read_domain / write_domain must be non-zero */
++    if ((read_domain && write_domain) || (!read_domain && !write_domain)) {
++        /* within one CS a bo is either read or written, never both
++         * at the same time (and never neither)
++         */
++        return -EINVAL;
++    }
++    if (read_domain == RADEON_GEM_DOMAIN_CPU) {
++        return -EINVAL;
++    }
++    if (write_domain == RADEON_GEM_DOMAIN_CPU) {
++        return -EINVAL;
++    }
++    /* check if bo is already referenced */
++    for(i = 0; i < cs->crelocs; i++) {
++        uint32_t *indices;
++
++        if (relocs[i].base.bo->handle == bo->handle) {
++            /* The arguments were checked above to carry exactly one of
++             * read or write domain.  A bo first recorded as read must
++             * be read again here, and one first recorded as written
++             * must be written again; switching between the two inside
++             * one CS is rejected.
++             */
++            if (relocs[i].base.read_domain && !read_domain) {
++                return -EINVAL;
++            }
++            if (relocs[i].base.write_domain && !write_domain) {
++                return -EINVAL;
++            }
++            relocs[i].base.read_domain |= read_domain;
++            relocs[i].base.write_domain |= write_domain;
++            /* append the packet index; the count is rolled back if realloc fails */
++            relocs[i].cindices++;
++            indices = (uint32_t*)realloc(relocs[i].indices,
++                                         relocs[i].cindices * 4);
++            if (indices == NULL) {
++                relocs[i].cindices -= 1;
++                return -ENOMEM;
++            }
++            relocs[i].indices = indices;
++            relocs[i].indices[relocs[i].cindices - 1] = cs->cdw - 1;
++            return 0;
++        }
++    }
++    /* first reference to this bo: grow the reloc array by one entry */
++    relocs = (struct cs_reloc_legacy*)
++             realloc(cs->relocs,
++                     sizeof(struct cs_reloc_legacy) * (cs->crelocs + 1));
++    if (relocs == NULL) {
++        return -ENOMEM;
++    }
++    cs->relocs = relocs;
++    relocs[cs->crelocs].base.bo = bo;
++    relocs[cs->crelocs].base.read_domain = read_domain;
++    relocs[cs->crelocs].base.write_domain = write_domain;
++    relocs[cs->crelocs].base.flags = flags;
++    relocs[cs->crelocs].indices = (uint32_t*)malloc(4);
++    if (relocs[cs->crelocs].indices == NULL) {
++        return -ENOMEM; /* crelocs not incremented, so the new slot stays unused */
++    }
++    relocs[cs->crelocs].indices[0] = cs->cdw - 1;
++    relocs[cs->crelocs].cindices = 1;
++    cs->relocs_total_size += radeon_bo_legacy_relocs_size(bo);
++    cs->crelocs++;
++    radeon_bo_ref(bo); /* one reference per distinct bo; cs_set_age drops it */
++    return 0;
++}
++
++static int cs_begin(struct radeon_cs *cs,
++                    uint32_t ndw,
++                    const char *file,
++                    const char *func,
++                    int line)
++{
++    /* Open a section that will write ndw dwords; file/func/line name the
++     * caller for diagnostics.  Returns 0, -EPIPE if a section is already
++     * open, or -ENOMEM if the packet buffer cannot be grown. */
++    if (cs->section) {
++        fprintf(stderr, "CS already in a section(%s,%s,%d)\n",
++                cs->section_file, cs->section_func, cs->section_line);
++        fprintf(stderr, "CS can't start section(%s,%s,%d)\n",
++                file, func, line);
++        return -EPIPE;
++    }
++    cs->section = 1;
++    cs->section_ndw = ndw;
++    cs->section_cdw = 0;
++    cs->section_file = file;
++    cs->section_func = func;
++    cs->section_line = line;
++
++    if (cs->cdw + ndw > cs->ndw) {
++        uint32_t tmp, *ptr;
++        /* Round the size actually needed (cdw + ndw) up to a multiple of
++         * 0x400 dwords; the mask must be 2^k-1 for the round-up to hold. */
++        tmp = (cs->cdw + ndw + 0x3FF) & (~0x3FF);
++        ptr = (uint32_t*)realloc(cs->packets, 4 * tmp);
++        if (ptr == NULL) {
++            return -ENOMEM;
++        }
++        cs->packets = ptr;
++        cs->ndw = tmp;
++    }
++    return 0;
++}
++
++static int cs_end(struct radeon_cs *cs,
++                  const char *file,
++                  const char *func,
++                  int line)
++/* Close the open section; -EPIPE if none is open or if the dwords written differ from the count given to cs_begin. */
++{
++    if (!cs->section) {
++        fprintf(stderr, "CS no section to end at (%s,%s,%d)\n",
++                file, func, line);
++        return -EPIPE;
++    }
++    cs->section = 0; /* the section is closed even when the size check below fails */
++    if (cs->section_ndw != cs->section_cdw) {
++        fprintf(stderr, "CS section size missmatch start at (%s,%s,%d) %d vs %d\n",
++                cs->section_file, cs->section_func, cs->section_line, cs->section_ndw, cs->section_cdw);
++        fprintf(stderr, "CS section end at (%s,%s,%d)\n",
++                file, func, line);
++        return -EPIPE;
++    }
++    return 0;
++}
++
++static int cs_process_relocs(struct radeon_cs *cs)
++{
++    struct cs_manager_legacy *csm = (struct cs_manager_legacy*)cs->csm;
++    struct cs_reloc_legacy *relocs;
++    int i, j, r;
++    /* Add each bo's validated start offset to every packet slot recorded for it; 0 or an error code. */
++    csm = (struct cs_manager_legacy*)cs->csm; /* same value as the initializer; csm is not read below */
++    relocs = (struct cs_reloc_legacy *)cs->relocs;
++ restart:
++    for (i = 0; i < cs->crelocs; i++) {
++        for (j = 0; j < relocs[i].cindices; j++) {
++            uint32_t soffset, eoffset;
++
++            r = radeon_bo_legacy_validate(relocs[i].base.bo,
++                                           &soffset, &eoffset);
++	    if (r == -EAGAIN) /* NOTE(review): restarting re-adds soffset to slots already patched in this pass -- confirm */
++	      goto restart;
++            if (r) {
++                fprintf(stderr, "validated %p [0x%08X, 0x%08X]\n", /* NOTE(review): offsets may be unset when validate fails */
++                        relocs[i].base.bo, soffset, eoffset);
++                return r;
++            }
++            cs->packets[relocs[i].indices[j]] += soffset; /* slot held a bo-relative offset; make it absolute */
++            if (cs->packets[relocs[i].indices[j]] >= eoffset) {
++	      /*                radeon_bo_debug(relocs[i].base.bo, 12); */
++                fprintf(stderr, "validated %p [0x%08X, 0x%08X]\n",
++                        relocs[i].base.bo, soffset, eoffset);
++                fprintf(stderr, "above end: %p 0x%08X 0x%08X\n",
++                        relocs[i].base.bo,
++                        cs->packets[relocs[i].indices[j]],
++                        eoffset);
++                exit(0); /* terminates the process with status 0; the return below is never reached */
++                return -EINVAL;
++            }
++        }
++    }
++    return 0;
++}
++
++static int cs_set_age(struct radeon_cs *cs)
++{
++    struct cs_manager_legacy *csm = (struct cs_manager_legacy*)cs->csm;
++    struct cs_reloc_legacy *relocs;
++    int i;
++    /* Tag every relocated bo with the manager's pending age, then drop the reference cs_write_reloc took; always 0. */
++    relocs = (struct cs_reloc_legacy *)cs->relocs;
++    for (i = 0; i < cs->crelocs; i++) {
++        radeon_bo_legacy_pending(relocs[i].base.bo, csm->pending_age);
++        radeon_bo_unref(relocs[i].base.bo);
++    }
++    return 0;
++}
++
++static void dump_cmdbuf(struct radeon_cs *cs)
++{
++  int i;
++  for (i = 0; i < cs->cdw; i++){
++    fprintf(stderr,"%x: %08x\n", i, cs->packets[i]);
++  }
++  /* Debug aid: prints "index: dword" in hex for every dword written so far; its only call in cs_emit is commented out. */
++}
++static int cs_emit(struct radeon_cs *cs)
++{
++    struct cs_manager_legacy *csm = (struct cs_manager_legacy*)cs->csm;
++    drm_radeon_cmd_buffer_t cmd;
++    drm_r300_cmd_header_t age;
++    uint64_t ull;
++    int r;
 +
-+struct radeon_cs_space_check {
-+    struct radeon_bo *bo;
-+    uint32_t read_domains;
-+    uint32_t write_domain;
-+    uint32_t new_accounted;
-+};
++    csm->ctx->vtbl.emit_cs_header(cs, csm->ctx);
 +
-+struct radeon_cs_manager;
++    /* append buffer age */
++    if (IS_R300_CLASS(csm->ctx->radeonScreen)) {
++      age.scratch.cmd_type = R300_CMD_SCRATCH;
++      /* Scratch register 2 corresponds to what radeonGetAge polls */
++      csm->pending_age = 0;
++      csm->pending_count = 1;
++      ull = (uint64_t) (intptr_t) &csm->pending_age;
++      age.scratch.reg = 2;
++      age.scratch.n_bufs = 1;
++      age.scratch.flags = 0;
++      radeon_cs_write_dword(cs, age.u);
++      radeon_cs_write_qword(cs, ull);
++      radeon_cs_write_dword(cs, 0);
++    }
 +
-+struct radeon_cs {
-+    struct radeon_cs_manager    *csm;
-+    void                        *relocs;
-+    uint32_t                    *packets;
-+    unsigned                    crelocs;
-+    unsigned                    relocs_total_size;
-+    unsigned                    cdw;
-+    unsigned                    ndw;
-+    int                         section;
-+    unsigned                    section_ndw;
-+    unsigned                    section_cdw;
-+    const char                  *section_file;
-+    const char                  *section_func;
-+    int                         section_line;
++    r = cs_process_relocs(cs);
++    if (r) {
++        return 0;
++    }
 +
-+};
++    cmd.buf = (char *)cs->packets;
++    cmd.bufsz = cs->cdw * 4;
++    if (csm->ctx->state.scissor.enabled) {
++        cmd.nbox = csm->ctx->state.scissor.numClipRects;
++        cmd.boxes = (drm_clip_rect_t *) csm->ctx->state.scissor.pClipRects;
++    } else {
++        cmd.nbox = csm->ctx->numClipRects;
++        cmd.boxes = (drm_clip_rect_t *) csm->ctx->pClipRects;
++    }
 +
-+/* cs functions */
-+struct radeon_cs_funcs {
-+    struct radeon_cs *(*cs_create)(struct radeon_cs_manager *csm,
-+                                   uint32_t ndw);
-+    int (*cs_write_reloc)(struct radeon_cs *cs,
-+                          struct radeon_bo *bo,
-+                          uint32_t read_domain,
-+                          uint32_t write_domain,
-+                          uint32_t flags);
-+    int (*cs_begin)(struct radeon_cs *cs,
-+                    uint32_t ndw,
-+                    const char *file,
-+                    const char *func,
-+                    int line);
-+    int (*cs_end)(struct radeon_cs *cs,
-+                  const char *file,
-+                  const char *func,
-+                  int line);
-+    int (*cs_emit)(struct radeon_cs *cs);
-+    int (*cs_destroy)(struct radeon_cs *cs);
-+    int (*cs_erase)(struct radeon_cs *cs);
-+    int (*cs_need_flush)(struct radeon_cs *cs);
-+    void (*cs_print)(struct radeon_cs *cs, FILE *file);
-+    int (*cs_space_check)(struct radeon_cs *cs, struct radeon_cs_space_check *bos,
-+			  int num_bo);
-+};
++    //dump_cmdbuf(cs);
 +
-+struct radeon_cs_manager {
-+    struct radeon_cs_funcs  *funcs;
-+    int                     fd;
-+    uint32_t vram_limit, gart_limit;
-+    uint32_t vram_write_used, gart_write_used;
-+    uint32_t read_used;
-+};
++    r = drmCommandWrite(cs->csm->fd, DRM_RADEON_CMDBUF, &cmd, sizeof(cmd));
++    if (r) {
++        return r;
++    }
++    if (!IS_R300_CLASS(csm->ctx->radeonScreen)) {
++	drm_radeon_irq_emit_t emit_cmd;
++	emit_cmd.irq_seq = &csm->pending_age;
++	r = drmCommandWrite(cs->csm->fd, DRM_RADEON_IRQ_EMIT, &emit_cmd, sizeof(emit_cmd));
++	if (r) {
++		return r;
++	}
++    }
++    cs_set_age(cs);
 +
-+static inline struct radeon_cs *radeon_cs_create(struct radeon_cs_manager *csm,
-+                                                 uint32_t ndw)
-+{
-+    return csm->funcs->cs_create(csm, ndw);
++    cs->csm->read_used = 0;
++    cs->csm->vram_write_used = 0;
++    cs->csm->gart_write_used = 0;
++    return 0;
 +}
 +
-+static inline int radeon_cs_write_reloc(struct radeon_cs *cs,
-+                                        struct radeon_bo *bo,
-+                                        uint32_t read_domain,
-+                                        uint32_t write_domain,
-+                                        uint32_t flags)
++static void inline cs_free_reloc(void *relocs_p, int crelocs)
 +{
-+    return cs->csm->funcs->cs_write_reloc(cs,
-+                                          bo,
-+                                          read_domain,
-+                                          write_domain,
-+                                          flags);
++    struct cs_reloc_legacy *relocs = relocs_p;
++    int i;
++    if (!relocs_p)
++      return;
++    for (i = 0; i < crelocs; i++)
++      free(relocs[i].indices);
 +}
 +
-+static inline int radeon_cs_begin(struct radeon_cs *cs,
-+                                  uint32_t ndw,
-+                                  const char *file,
-+                                  const char *func,
-+                                  int line)
++static int cs_destroy(struct radeon_cs *cs)
 +{
-+    return cs->csm->funcs->cs_begin(cs, ndw, file, func, line);
++    cs_free_reloc(cs->relocs, cs->crelocs);
++    free(cs->relocs);
++    free(cs->packets);
++    free(cs);
++    return 0;
 +}
 +
-+static inline int radeon_cs_end(struct radeon_cs *cs,
-+                                const char *file,
-+                                const char *func,
-+                                int line)
++static int cs_erase(struct radeon_cs *cs)
 +{
-+    return cs->csm->funcs->cs_end(cs, file, func, line);
++    cs_free_reloc(cs->relocs, cs->crelocs);
++    free(cs->relocs);
++    cs->relocs_total_size = 0;
++    cs->relocs = NULL;
++    cs->crelocs = 0;
++    cs->cdw = 0;
++    cs->section = 0;
++    return 0;
 +}
 +
-+static inline int radeon_cs_emit(struct radeon_cs *cs)
++static int cs_need_flush(struct radeon_cs *cs)
 +{
-+    return cs->csm->funcs->cs_emit(cs);
++    /* this function used to flush when the BO usage got to
++     * a certain size, now the higher levels handle this better */
++    return 0;
 +}
 +
-+static inline int radeon_cs_destroy(struct radeon_cs *cs)
++static void cs_print(struct radeon_cs *cs, FILE *file)
 +{
-+    return cs->csm->funcs->cs_destroy(cs);
 +}
 +
-+static inline int radeon_cs_erase(struct radeon_cs *cs)
++static int cs_check_space(struct radeon_cs *cs, struct radeon_cs_space_check *bos, int num_bo)
 +{
-+    return cs->csm->funcs->cs_erase(cs);
-+}
++    struct radeon_cs_manager *csm = cs->csm;
++    int this_op_read = 0, this_op_gart_write = 0, this_op_vram_write = 0;
++    uint32_t read_domains, write_domain;
++    int i;
++    struct radeon_bo *bo;
 +
-+static inline int radeon_cs_need_flush(struct radeon_cs *cs)
-+{
-+    return cs->csm->funcs->cs_need_flush(cs);
-+}
++    /* Account the bos of one operation against the manager's VRAM/GART limits.
++     * Returns RADEON_CS_SPACE_OK, RADEON_CS_SPACE_OP_TO_BIG or RADEON_CS_SPACE_FLUSH. */
++    if (num_bo == 0)
++        return 0;
++
++    /* prepare */
++    for (i = 0; i < num_bo; i++) {
++      bo = bos[i].bo;
++
++      bos[i].new_accounted = 0;
++      read_domains = bos[i].read_domains;
++      write_domain = bos[i].write_domain;
++		
++      /* pinned bos don't count */
++      if (radeon_legacy_bo_is_static(bo))
++	  continue;
++ 
++      /* already accounted this bo */
++      if (write_domain && (write_domain == bo->space_accounted))
++	  continue;
++
++      if (read_domains && ((read_domains << 16) == bo->space_accounted))
++	  continue;
++      /* space_accounted packs read domains in the high 16 bits, write domain in the low 16 */
++      if (bo->space_accounted == 0) {
++	  if (write_domain == RADEON_GEM_DOMAIN_VRAM)
++	      this_op_vram_write += bo->size;
++	  else if (write_domain == RADEON_GEM_DOMAIN_GTT)
++	      this_op_gart_write += bo->size;
++	  else
++	      this_op_read += bo->size;
++	  bos[i].new_accounted = (read_domains << 16) | write_domain;
++      } else {
++	  uint16_t old_read, old_write;
++	  
++	  old_read = bo->space_accounted >> 16;
++	  old_write = bo->space_accounted & 0xffff;
++
++	  if (write_domain && (old_read & write_domain)) {
++	      bos[i].new_accounted = write_domain;
++	      /* moving from read to a write domain */
++	      if (write_domain == RADEON_GEM_DOMAIN_VRAM) {
++		  this_op_read -= bo->size;
++		  this_op_vram_write += bo->size;
++	      } else if (write_domain == RADEON_GEM_DOMAIN_GTT) { /* was a second VRAM test, leaving this branch dead */
++		  this_op_read -= bo->size;
++		  this_op_gart_write += bo->size;
++	      }
++	  } else if (read_domains & old_write) {
++	      bos[i].new_accounted = bo->space_accounted & 0xffff;
++	  } else {
++	      /* rewrite the domains */
++	      if (write_domain != old_write)
++		  fprintf(stderr,"WRITE DOMAIN RELOC FAILURE 0x%x %d %d\n", bo->handle, write_domain, old_write);
++	      if (read_domains != old_read)
++		  fprintf(stderr,"READ DOMAIN RELOC FAILURE 0x%x %d %d\n", bo->handle, read_domains, old_read);
++	      return RADEON_CS_SPACE_FLUSH;
++	  }
++      }
++	}
++	
++	if (this_op_read < 0)
++		this_op_read = 0;
++
++	/* check sizes - operation first */
++	if ((this_op_read + this_op_gart_write > csm->gart_limit) ||
++	    (this_op_vram_write > csm->vram_limit)) {
++	    return RADEON_CS_SPACE_OP_TO_BIG;
++	}
++
++	if (((csm->vram_write_used + this_op_vram_write) > csm->vram_limit) ||
++	    ((csm->read_used + csm->gart_write_used + this_op_gart_write + this_op_read) > csm->gart_limit)) {
++		return RADEON_CS_SPACE_FLUSH;
++	}
 +
-+static inline void radeon_cs_print(struct radeon_cs *cs, FILE *file)
-+{
-+    cs->csm->funcs->cs_print(cs, file);
-+}
++	csm->gart_write_used += this_op_gart_write;
++	csm->vram_write_used += this_op_vram_write;
++	csm->read_used += this_op_read;
++	/* commit -- NOTE(review): bos skipped above still have new_accounted == 0, so this resets their space_accounted; confirm intended */
++	for (i = 0; i < num_bo; i++) {
++		bo = bos[i].bo;
++		bo->space_accounted = bos[i].new_accounted;
++	}
 +
-+static inline int radeon_cs_space_check(struct radeon_cs *cs,
-+					    struct radeon_cs_space_check *bos,
-+					    int num_bo)
-+{
-+    return cs->csm->funcs->cs_space_check(cs, bos, num_bo);
++	return RADEON_CS_SPACE_OK;
 +}
 +
-+static inline void radeon_cs_set_limit(struct radeon_cs *cs, uint32_t domain, uint32_t limit)
-+{
-+    
-+    if (domain == RADEON_GEM_DOMAIN_VRAM)
-+	cs->csm->vram_limit = limit;
-+    else
-+	cs->csm->gart_limit = limit;
-+}
++static struct radeon_cs_funcs  radeon_cs_legacy_funcs = {
++    cs_create,
++    cs_write_reloc,
++    cs_begin,
++    cs_end,
++    cs_emit,
++    cs_destroy,
++    cs_erase,
++    cs_need_flush,
++    cs_print,
++    cs_check_space
++};
 +
-+static inline void radeon_cs_write_dword(struct radeon_cs *cs, uint32_t dword)
++struct radeon_cs_manager *radeon_cs_manager_legacy_ctor(struct radeon_context *ctx)
 +{
-+    cs->packets[cs->cdw++] = dword;
-+    if (cs->section) {
-+        cs->section_cdw++;
++    struct cs_manager_legacy *csm;
++
++    csm = (struct cs_manager_legacy*)
++          calloc(1, sizeof(struct cs_manager_legacy));
++    if (csm == NULL) {
++        return NULL;
 +    }
++    csm->base.funcs = &radeon_cs_legacy_funcs;
++    csm->base.fd = ctx->dri.fd;
++    csm->ctx = ctx;
++    csm->pending_age = 1;
++    return (struct radeon_cs_manager*)csm;
 +}
 +
-+static inline void radeon_cs_write_qword(struct radeon_cs *cs, uint64_t qword)
++void radeon_cs_manager_legacy_dtor(struct radeon_cs_manager *csm)
 +{
-+
-+    memcpy(cs->packets + cs->cdw, &qword, sizeof(qword));
-+    cs->cdw+=2;
-+    if (cs->section) {
-+        cs->section_cdw+=2;
-+    }
++    free(csm);
 +}
-+#endif
-diff --git a/src/mesa/drivers/dri/radeon/radeon_cs_legacy.c b/src/mesa/drivers/dri/radeon/radeon_cs_legacy.c
++
+diff --git a/src/mesa/drivers/dri/radeon/radeon_cs_legacy.h b/src/mesa/drivers/dri/radeon/radeon_cs_legacy.h
 new file mode 100644
-index 0000000..b47b095
+index 0000000..e177b4b
 --- /dev/null
-+++ b/src/mesa/drivers/dri/radeon/radeon_cs_legacy.c
-@@ -0,0 +1,504 @@
++++ b/src/mesa/drivers/dri/radeon/radeon_cs_legacy.h
+@@ -0,0 +1,40 @@
 +/* 
 + * Copyright © 2008 Nicolai Haehnle
 + * Copyright © 2008 Jérôme Glisse
@@ -3611,531 +24992,350 @@ index 0000000..b47b095
 + *      Nicolai Haehnle <prefect_@gmx.net>
 + *      Jérôme Glisse <glisse@freedesktop.org>
 + */
-+#include <errno.h>
-+
-+#include "radeon_bocs_wrapper.h"
-+
-+struct cs_manager_legacy {
-+    struct radeon_cs_manager    base;
-+    struct radeon_context       *ctx;
-+    /* hack for scratch stuff */
-+    uint32_t                    pending_age;
-+    uint32_t                    pending_count;
-+
-+
-+};
-+
-+struct cs_reloc_legacy {
-+    struct radeon_cs_reloc  base;
-+    uint32_t                cindices;
-+    uint32_t                *indices;
-+};
-+
-+
-+static struct radeon_cs *cs_create(struct radeon_cs_manager *csm,
-+                                   uint32_t ndw)
-+{
-+    struct radeon_cs *cs;
++#ifndef RADEON_CS_LEGACY_H
++#define RADEON_CS_LEGACY_H
 +
-+    cs = (struct radeon_cs*)calloc(1, sizeof(struct radeon_cs));
-+    if (cs == NULL) {
-+        return NULL;
-+    }
-+    cs->csm = csm;
-+    cs->ndw = (ndw + 0x3FF) & (~0x3FF);
-+    cs->packets = (uint32_t*)malloc(4*cs->ndw);
-+    if (cs->packets == NULL) {
-+        free(cs);
-+        return NULL;
-+    }
-+    cs->relocs_total_size = 0;
-+    return cs;
-+}
++#include "radeon_common.h"
 +
-+static int cs_write_reloc(struct radeon_cs *cs,
-+                          struct radeon_bo *bo,
-+                          uint32_t read_domain,
-+                          uint32_t write_domain,
-+                          uint32_t flags)
-+{
-+    struct cs_reloc_legacy *relocs;
-+    int i;
++struct radeon_cs_manager *radeon_cs_manager_legacy_ctor(struct radeon_context *ctx); /* calloc()s the manager, records ctx and ctx->dri.fd; returns NULL if the allocation fails */
++void radeon_cs_manager_legacy_dtor(struct radeon_cs_manager *csm); /* releases the manager with free() */
 +
-+    relocs = (struct cs_reloc_legacy *)cs->relocs;
-+    /* check domains */
-+    if ((read_domain && write_domain) || (!read_domain && !write_domain)) {
-+        /* in one CS a bo can only be in read or write domain but not
-+         * in read & write domain at the same sime
-+         */
-+        return -EINVAL;
-+    }
-+    if (read_domain == RADEON_GEM_DOMAIN_CPU) {
-+        return -EINVAL;
-+    }
-+    if (write_domain == RADEON_GEM_DOMAIN_CPU) {
-+        return -EINVAL;
-+    }
-+    /* check if bo is already referenced */
-+    for(i = 0; i < cs->crelocs; i++) {
-+        uint32_t *indices;
++#endif
+diff --git a/src/mesa/drivers/dri/radeon/radeon_dma.c b/src/mesa/drivers/dri/radeon/radeon_dma.c
+new file mode 100644
+index 0000000..393b121
+--- /dev/null
++++ b/src/mesa/drivers/dri/radeon/radeon_dma.c
+@@ -0,0 +1,323 @@
++/**************************************************************************
 +
-+        if (relocs[i].base.bo->handle == bo->handle) {
-+            /* Check domains must be in read or write. As we check already
-+             * checked that in argument one of the read or write domain was
-+             * set we only need to check that if previous reloc as the read
-+             * domain set then the read_domain should also be set for this
-+             * new relocation.
-+             */
-+            if (relocs[i].base.read_domain && !read_domain) {
-+                return -EINVAL;
-+            }
-+            if (relocs[i].base.write_domain && !write_domain) {
-+                return -EINVAL;
-+            }
-+            relocs[i].base.read_domain |= read_domain;
-+            relocs[i].base.write_domain |= write_domain;
-+            /* save indice */
-+            relocs[i].cindices++;
-+            indices = (uint32_t*)realloc(relocs[i].indices,
-+                                         relocs[i].cindices * 4);
-+            if (indices == NULL) {
-+                relocs[i].cindices -= 1;
-+                return -ENOMEM;
-+            }
-+            relocs[i].indices = indices;
-+            relocs[i].indices[relocs[i].cindices - 1] = cs->cdw - 1;
-+            return 0;
-+        }
-+    }
-+    /* add bo to reloc */
-+    relocs = (struct cs_reloc_legacy*)
-+             realloc(cs->relocs,
-+                     sizeof(struct cs_reloc_legacy) * (cs->crelocs + 1));
-+    if (relocs == NULL) {
-+        return -ENOMEM;
-+    }
-+    cs->relocs = relocs;
-+    relocs[cs->crelocs].base.bo = bo;
-+    relocs[cs->crelocs].base.read_domain = read_domain;
-+    relocs[cs->crelocs].base.write_domain = write_domain;
-+    relocs[cs->crelocs].base.flags = flags;
-+    relocs[cs->crelocs].indices = (uint32_t*)malloc(4);
-+    if (relocs[cs->crelocs].indices == NULL) {
-+        return -ENOMEM;
-+    }
-+    relocs[cs->crelocs].indices[0] = cs->cdw - 1;
-+    relocs[cs->crelocs].cindices = 1;
-+    cs->relocs_total_size += radeon_bo_legacy_relocs_size(bo);
-+    cs->crelocs++;
-+    radeon_bo_ref(bo);
-+    return 0;
-+}
++Copyright (C) 2004 Nicolai Haehnle.
++Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
 +
-+static int cs_begin(struct radeon_cs *cs,
-+                    uint32_t ndw,
-+                    const char *file,
-+                    const char *func,
-+                    int line)
-+{
-+    if (cs->section) {
-+        fprintf(stderr, "CS already in a section(%s,%s,%d)\n",
-+                cs->section_file, cs->section_func, cs->section_line);
-+        fprintf(stderr, "CS can't start section(%s,%s,%d)\n",
-+                file, func, line);
-+        return -EPIPE;
-+    }
-+    cs->section = 1;
-+    cs->section_ndw = ndw;
-+    cs->section_cdw = 0;
-+    cs->section_file = file;
-+    cs->section_func = func;
-+    cs->section_line = line;
++The Weather Channel (TM) funded Tungsten Graphics to develop the
++initial release of the Radeon 8500 driver under the XFree86 license.
++This notice must be preserved.
 +
++All Rights Reserved.
 +
-+    if (cs->cdw + ndw > cs->ndw) {
-+        uint32_t tmp, *ptr;
-+	int num = (ndw > 0x3FF) ? ndw : 0x3FF;
++Permission is hereby granted, free of charge, to any person obtaining a
++copy of this software and associated documentation files (the "Software"),
++to deal in the Software without restriction, including without limitation
++on the rights to use, copy, modify, merge, publish, distribute, sub
++license, and/or sell copies of the Software, and to permit persons to whom
++the Software is furnished to do so, subject to the following conditions:
 +
-+        tmp = (cs->cdw + 1 + num) & (~num);
-+        ptr = (uint32_t*)realloc(cs->packets, 4 * tmp);
-+        if (ptr == NULL) {
-+            return -ENOMEM;
-+        }
-+        cs->packets = ptr;
-+        cs->ndw = tmp;
-+    }
++The above copyright notice and this permission notice (including the next
++paragraph) shall be included in all copies or substantial portions of the
++Software.
 +
-+    return 0;
-+}
++THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
++ATI, VA LINUX SYSTEMS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
++DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
++OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
++USE OR OTHER DEALINGS IN THE SOFTWARE.
 +
-+static int cs_end(struct radeon_cs *cs,
-+                  const char *file,
-+                  const char *func,
-+                  int line)
++**************************************************************************/
 +
-+{
-+    if (!cs->section) {
-+        fprintf(stderr, "CS no section to end at (%s,%s,%d)\n",
-+                file, func, line);
-+        return -EPIPE;
-+    }
-+    cs->section = 0;
-+    if (cs->section_ndw != cs->section_cdw) {
-+        fprintf(stderr, "CS section size missmatch start at (%s,%s,%d) %d vs %d\n",
-+                cs->section_file, cs->section_func, cs->section_line, cs->section_ndw, cs->section_cdw);
-+        fprintf(stderr, "CS section end at (%s,%s,%d)\n",
-+                file, func, line);
-+        return -EPIPE;
-+    }
-+    return 0;
-+}
++#include "radeon_common.h"
 +
-+static int cs_process_relocs(struct radeon_cs *cs)
++#if defined(USE_X86_ASM) /* x86 variant: copies nr dwords with "rep ; movsl"; dst is bound to the "=D" output, so it is written back after the copy */
++#define COPY_DWORDS( dst, src, nr )					\
++do {									\
++	int __tmp;							\
++	__asm__ __volatile__( "rep ; movsl"				\
++			      : "=%c" (__tmp), "=D" (dst), "=S" (__tmp)	\
++			      : "0" (nr),				\
++			        "D" ((long)dst),			\
++			        "S" ((long)src) );			\
++} while (0)
++#else /* portable variant: copies nr ints from src to dst, then advances dst by nr. NOTE(review): dst/src/nr expand unparenthesized and the expansion declares its own j */
++#define COPY_DWORDS( dst, src, nr )		\
++do {						\
++   int j;					\
++   for ( j = 0 ; j < nr ; j++ )			\
++      dst[j] = ((int *)src)[j];			\
++   dst += nr;					\
++} while (0)
++#endif /* USE_X86_ASM */
++
++static void radeonEmitVec4(uint32_t *out, GLvoid * data, int stride, int count)
 +{
-+    struct cs_manager_legacy *csm = (struct cs_manager_legacy*)cs->csm;
-+    struct cs_reloc_legacy *relocs;
-+    int i, j, r;
++	int i;
 +
-+    csm = (struct cs_manager_legacy*)cs->csm;
-+    relocs = (struct cs_reloc_legacy *)cs->relocs;
-+ restart:
-+    for (i = 0; i < cs->crelocs; i++) {
-+        for (j = 0; j < relocs[i].cindices; j++) {
-+            uint32_t soffset, eoffset;
++	if (RADEON_DEBUG & DEBUG_VERTS)
++		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
++			__FUNCTION__, count, stride, (void *)out, (void *)data);
 +
-+            r = radeon_bo_legacy_validate(relocs[i].base.bo,
-+                                           &soffset, &eoffset);
-+	    if (r == -EAGAIN)
-+	      goto restart;
-+            if (r) {
-+                fprintf(stderr, "validated %p [0x%08X, 0x%08X]\n",
-+                        relocs[i].base.bo, soffset, eoffset);
-+                return r;
-+            }
-+            cs->packets[relocs[i].indices[j]] += soffset;
-+            if (cs->packets[relocs[i].indices[j]] >= eoffset) {
-+	      /*                radeon_bo_debug(relocs[i].base.bo, 12); */
-+                fprintf(stderr, "validated %p [0x%08X, 0x%08X]\n",
-+                        relocs[i].base.bo, soffset, eoffset);
-+                fprintf(stderr, "above end: %p 0x%08X 0x%08X\n",
-+                        relocs[i].base.bo,
-+                        cs->packets[relocs[i].indices[j]],
-+                        eoffset);
-+                exit(0);
-+                return -EINVAL;
-+            }
-+        }
-+    }
-+    return 0;
++	if (stride == 4)
++		COPY_DWORDS(out, data, count);
++	else
++		for (i = 0; i < count; i++) {
++			out[0] = *(int *)data;
++			out++;
++			data += stride;
++		}
 +}
 +
-+static int cs_set_age(struct radeon_cs *cs)
++void radeonEmitVec8(uint32_t *out, GLvoid * data, int stride, int count)
 +{
-+    struct cs_manager_legacy *csm = (struct cs_manager_legacy*)cs->csm;
-+    struct cs_reloc_legacy *relocs;
-+    int i;
-+
-+    relocs = (struct cs_reloc_legacy *)cs->relocs;
-+    for (i = 0; i < cs->crelocs; i++) {
-+        radeon_bo_legacy_pending(relocs[i].base.bo, csm->pending_age);
-+        radeon_bo_unref(relocs[i].base.bo);
-+    }
-+    return 0;
-+}
++	int i;
 +
-+static void dump_cmdbuf(struct radeon_cs *cs)
-+{
-+  int i;
-+  for (i = 0; i < cs->cdw; i++){
-+    fprintf(stderr,"%x: %08x\n", i, cs->packets[i]);
-+  }
++	if (RADEON_DEBUG & DEBUG_VERTS)
++		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
++			__FUNCTION__, count, stride, (void *)out, (void *)data);
 +
++	if (stride == 8)
++		COPY_DWORDS(out, data, count * 2);
++	else
++		for (i = 0; i < count; i++) {
++			out[0] = *(int *)data;
++			out[1] = *(int *)(data + 4);
++			out += 2;
++			data += stride;
++		}
 +}
-+static int cs_emit(struct radeon_cs *cs)
-+{
-+    struct cs_manager_legacy *csm = (struct cs_manager_legacy*)cs->csm;
-+    drm_radeon_cmd_buffer_t cmd;
-+    drm_r300_cmd_header_t age;
-+    uint64_t ull;
-+    int r;
 +
-+    csm->ctx->vtbl.emit_cs_header(cs, csm->ctx);
-+
-+    /* append buffer age */
-+    if (IS_R300_CLASS(csm->ctx->radeonScreen)) {
-+      age.scratch.cmd_type = R300_CMD_SCRATCH;
-+      /* Scratch register 2 corresponds to what radeonGetAge polls */
-+      csm->pending_age = 0;
-+      csm->pending_count = 1;
-+      ull = (uint64_t) (intptr_t) &csm->pending_age;
-+      age.scratch.reg = 2;
-+      age.scratch.n_bufs = 1;
-+      age.scratch.flags = 0;
-+      radeon_cs_write_dword(cs, age.u);
-+      radeon_cs_write_qword(cs, ull);
-+      radeon_cs_write_dword(cs, 0);
-+    }
++void radeonEmitVec12(uint32_t *out, GLvoid * data, int stride, int count)
++{
++	int i;
 +
-+    r = cs_process_relocs(cs);
-+    if (r) {
-+        return 0;
-+    }
++	if (RADEON_DEBUG & DEBUG_VERTS)
++		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
++			__FUNCTION__, count, stride, (void *)out, (void *)data);
 +
-+    cmd.buf = (char *)cs->packets;
-+    cmd.bufsz = cs->cdw * 4;
-+    if (csm->ctx->state.scissor.enabled) {
-+        cmd.nbox = csm->ctx->state.scissor.numClipRects;
-+        cmd.boxes = (drm_clip_rect_t *) csm->ctx->state.scissor.pClipRects;
-+    } else {
-+        cmd.nbox = csm->ctx->numClipRects;
-+        cmd.boxes = (drm_clip_rect_t *) csm->ctx->pClipRects;
++	if (stride == 12) {
++		COPY_DWORDS(out, data, count * 3);
 +    }
++	else
++		for (i = 0; i < count; i++) {
++			out[0] = *(int *)data;
++			out[1] = *(int *)(data + 4);
++			out[2] = *(int *)(data + 8);
++			out += 3;
++			data += stride;
++		}
++}
 +
-+    //dump_cmdbuf(cs);
++static void radeonEmitVec16(uint32_t *out, GLvoid * data, int stride, int count)
++{
++	int i;
 +
-+    r = drmCommandWrite(cs->csm->fd, DRM_RADEON_CMDBUF, &cmd, sizeof(cmd));
-+    if (r) {
-+        return r;
-+    }
-+    if (!IS_R300_CLASS(csm->ctx->radeonScreen)) {
-+	drm_radeon_irq_emit_t emit_cmd;
-+	emit_cmd.irq_seq = &csm->pending_age;
-+	r = drmCommandWrite(cs->csm->fd, DRM_RADEON_IRQ_EMIT, &emit_cmd, sizeof(emit_cmd));
-+	if (r) {
-+		return r;
-+	}
-+    }
-+    cs_set_age(cs);
++	if (RADEON_DEBUG & DEBUG_VERTS)
++		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
++			__FUNCTION__, count, stride, (void *)out, (void *)data);
 +
-+    cs->csm->read_used = 0;
-+    cs->csm->vram_write_used = 0;
-+    cs->csm->gart_write_used = 0;
-+    return 0;
++	if (stride == 16)
++		COPY_DWORDS(out, data, count * 4);
++	else
++		for (i = 0; i < count; i++) {
++			out[0] = *(int *)data;
++			out[1] = *(int *)(data + 4);
++			out[2] = *(int *)(data + 8);
++			out[3] = *(int *)(data + 12);
++			out += 4;
++			data += stride;
++		}
 +}
 +
-+static void inline cs_free_reloc(void *relocs_p, int crelocs)
++void rcommon_emit_vector(GLcontext * ctx, struct radeon_aos *aos,
++			 GLvoid * data, int size, int stride, int count)
 +{
-+    struct cs_reloc_legacy *relocs = relocs_p;
-+    int i;
-+    if (!relocs_p)
-+      return;
-+    for (i = 0; i < crelocs; i++)
-+      free(relocs[i].indices);
-+}
++	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++	uint32_t *out;
 +
-+static int cs_destroy(struct radeon_cs *cs)
-+{
-+    cs_free_reloc(cs->relocs, cs->crelocs);
-+    free(cs->relocs);
-+    free(cs->packets);
-+    free(cs);
-+    return 0;
-+}
++	if (stride == 0) {
++		radeonAllocDmaRegion(rmesa, &aos->bo, &aos->offset, size * 4, 32);
++		count = 1;
++		aos->stride = 0;
++	} else {
++		radeonAllocDmaRegion(rmesa, &aos->bo, &aos->offset, size * count * 4, 32);
++		aos->stride = size;
++	}
 +
-+static int cs_erase(struct radeon_cs *cs)
-+{
-+    cs_free_reloc(cs->relocs, cs->crelocs);
-+    free(cs->relocs);
-+    cs->relocs_total_size = 0;
-+    cs->relocs = NULL;
-+    cs->crelocs = 0;
-+    cs->cdw = 0;
-+    cs->section = 0;
-+    return 0;
-+}
++	aos->components = size;
++	aos->count = count;
 +
-+static int cs_need_flush(struct radeon_cs *cs)
-+{
-+    /* this function used to flush when the BO usage got to
-+     * a certain size, now the higher levels handle this better */
-+    return 0;
++	out = (uint32_t*)((char*)aos->bo->ptr + aos->offset);
++	switch (size) {
++	case 1: radeonEmitVec4(out, data, stride, count); break;
++	case 2: radeonEmitVec8(out, data, stride, count); break;
++	case 3: radeonEmitVec12(out, data, stride, count); break;
++	case 4: radeonEmitVec16(out, data, stride, count); break;
++	default:
++		assert(0);
++		break;
++	}
 +}
 +
-+static void cs_print(struct radeon_cs *cs, FILE *file)
++void radeonRefillCurrentDmaRegion(radeonContextPtr rmesa, int size)
 +{
-+}
++	struct radeon_cs_space_check bos[1];
++	int flushed = 0, ret;
 +
-+static int cs_check_space(struct radeon_cs *cs, struct radeon_cs_space_check *bos, int num_bo)
-+{
-+    struct radeon_cs_manager *csm = cs->csm;
-+    int this_op_read = 0, this_op_gart_write = 0, this_op_vram_write = 0;
-+    uint32_t read_domains, write_domain;
-+    int i;
-+    struct radeon_bo *bo;
++	size = MAX2(size, MAX_DMA_BUF_SZ * 16);
 +
-+    /* check the totals for this operation */
++	if (RADEON_DEBUG & (DEBUG_IOCTL | DEBUG_DMA))
++		fprintf(stderr, "%s\n", __FUNCTION__);
 +
-+    if (num_bo == 0)
-+        return 0;
++	if (rmesa->dma.flush) {
++		rmesa->dma.flush(rmesa->glCtx);
++	}
 +
-+    /* prepare */
-+    for (i = 0; i < num_bo; i++) {
-+      bo = bos[i].bo;
++	if (rmesa->dma.nr_released_bufs > 4) {
++		rcommonFlushCmdBuf(rmesa, __FUNCTION__);
++		rmesa->dma.nr_released_bufs = 0;
++	}
 +
-+      bos[i].new_accounted = 0;
-+      read_domains = bos[i].read_domains;
-+      write_domain = bos[i].write_domain;
-+		
-+      /* pinned bos don't count */
-+      if (radeon_legacy_bo_is_static(bo))
-+	  continue;
-+ 
-+      /* already accounted this bo */
-+      if (write_domain && (write_domain == bo->space_accounted))
-+	  continue;
++	if (rmesa->dma.current) {
++		radeon_bo_unmap(rmesa->dma.current);
++		radeon_bo_unref(rmesa->dma.current);
++		rmesa->dma.current = 0;
++	}
 +
-+      if (read_domains && ((read_domains << 16) == bo->space_accounted))
-+	  continue;
-+      
-+      if (bo->space_accounted == 0) {
-+	  if (write_domain == RADEON_GEM_DOMAIN_VRAM)
-+	      this_op_vram_write += bo->size;
-+	  else if (write_domain == RADEON_GEM_DOMAIN_GTT)
-+	      this_op_gart_write += bo->size;
-+	  else
-+	      this_op_read += bo->size;
-+	  bos[i].new_accounted = (read_domains << 16) | write_domain;
-+      } else {
-+	  uint16_t old_read, old_write;
-+	  
-+	  old_read = bo->space_accounted >> 16;
-+	  old_write = bo->space_accounted & 0xffff;
++again_alloc:	
++	rmesa->dma.current = radeon_bo_open(rmesa->radeonScreen->bom,
++					    0, size, 4, RADEON_GEM_DOMAIN_GTT,
++					    0);
 +
-+	  if (write_domain && (old_read & write_domain)) {
-+	      bos[i].new_accounted = write_domain;
-+	      /* moving from read to a write domain */
-+	      if (write_domain == RADEON_GEM_DOMAIN_VRAM) {
-+		  this_op_read -= bo->size;
-+		  this_op_vram_write += bo->size;
-+	      } else if (write_domain == RADEON_GEM_DOMAIN_VRAM) {
-+		  this_op_read -= bo->size;
-+		  this_op_gart_write += bo->size;
-+	      }
-+	  } else if (read_domains & old_write) {
-+	      bos[i].new_accounted = bo->space_accounted & 0xffff;
-+	  } else {
-+	      /* rewrite the domains */
-+	      if (write_domain != old_write)
-+		  fprintf(stderr,"WRITE DOMAIN RELOC FAILURE 0x%x %d %d\n", bo->handle, write_domain, old_write);
-+	      if (read_domains != old_read)
-+		  fprintf(stderr,"READ DOMAIN RELOC FAILURE 0x%x %d %d\n", bo->handle, read_domains, old_read);
-+	      return RADEON_CS_SPACE_FLUSH;
-+	  }
-+      }
++	if (!rmesa->dma.current) {
++		rcommonFlushCmdBuf(rmesa, __FUNCTION__);
++		rmesa->dma.nr_released_bufs = 0;
++		goto again_alloc;
 +	}
++
++	rmesa->dma.current_used = 0;
++	rmesa->dma.current_vertexptr = 0;
 +	
-+	if (this_op_read < 0)
-+		this_op_read = 0;
++	bos[0].bo = rmesa->dma.current;
++	bos[0].read_domains = RADEON_GEM_DOMAIN_GTT;
++	bos[0].write_domain =0 ;
++	bos[0].new_accounted = 0;
 +
-+	/* check sizes - operation first */
-+	if ((this_op_read + this_op_gart_write > csm->gart_limit) ||
-+	    (this_op_vram_write > csm->vram_limit)) {
-+	    return RADEON_CS_SPACE_OP_TO_BIG;
++	ret = radeon_cs_space_check(rmesa->cmdbuf.cs, bos, 1);
++	if (ret == RADEON_CS_SPACE_OP_TO_BIG) {
++		fprintf(stderr,"Got OPEARTION TO BIG ILLEGAL - this cannot happen");
++		assert(0);
++	} else if (ret == RADEON_CS_SPACE_FLUSH) {
++		rcommonFlushCmdBuf(rmesa, __FUNCTION__);
++		if (flushed) {
++			fprintf(stderr,"flushed but still no space\n");
++			assert(0);
++		}
++		flushed = 1;
++		goto again_alloc;
 +	}
++	radeon_bo_map(rmesa->dma.current, 1);
++}
 +
-+	if (((csm->vram_write_used + this_op_vram_write) > csm->vram_limit) ||
-+	    ((csm->read_used + csm->gart_write_used + this_op_gart_write + this_op_read) > csm->gart_limit)) {
-+		return RADEON_CS_SPACE_FLUSH;
-+	}
++/* Allocates a region from rmesa->dma.current.  If there isn't enough
++ * space in current, grab a new buffer (and discard what was left of current)
++ */
++void radeonAllocDmaRegion(radeonContextPtr rmesa,
++			  struct radeon_bo **pbo, int *poffset,
++			  int bytes, int alignment)
++{
++	if (RADEON_DEBUG & DEBUG_IOCTL)
++		fprintf(stderr, "%s %d\n", __FUNCTION__, bytes);
 +
-+	csm->gart_write_used += this_op_gart_write;
-+	csm->vram_write_used += this_op_vram_write;
-+	csm->read_used += this_op_read;
-+	/* commit */
-+	for (i = 0; i < num_bo; i++) {
-+		bo = bos[i].bo;
-+		bo->space_accounted = bos[i].new_accounted;
-+	}
++	if (rmesa->dma.flush)
++		rmesa->dma.flush(rmesa->glCtx);
 +
-+	return RADEON_CS_SPACE_OK;
-+}
++	assert(rmesa->dma.current_used == rmesa->dma.current_vertexptr);
 +
-+static struct radeon_cs_funcs  radeon_cs_legacy_funcs = {
-+    cs_create,
-+    cs_write_reloc,
-+    cs_begin,
-+    cs_end,
-+    cs_emit,
-+    cs_destroy,
-+    cs_erase,
-+    cs_need_flush,
-+    cs_print,
-+    cs_check_space
-+};
++	alignment--;
++	rmesa->dma.current_used = (rmesa->dma.current_used + alignment) & ~alignment;
 +
-+struct radeon_cs_manager *radeon_cs_manager_legacy_ctor(struct radeon_context *ctx)
-+{
-+    struct cs_manager_legacy *csm;
++	if (!rmesa->dma.current || rmesa->dma.current_used + bytes > rmesa->dma.current->size)
++		radeonRefillCurrentDmaRegion(rmesa, (bytes + 15) & ~15);
 +
-+    csm = (struct cs_manager_legacy*)
-+          calloc(1, sizeof(struct cs_manager_legacy));
-+    if (csm == NULL) {
-+        return NULL;
-+    }
-+    csm->base.funcs = &radeon_cs_legacy_funcs;
-+    csm->base.fd = ctx->dri.fd;
-+    csm->ctx = ctx;
-+    csm->pending_age = 1;
-+    return (struct radeon_cs_manager*)csm;
++	*poffset = rmesa->dma.current_used;
++	*pbo = rmesa->dma.current;
++	radeon_bo_ref(*pbo);
++
++	/* Always align to at least 16 bytes */
++	rmesa->dma.current_used = (rmesa->dma.current_used + bytes + 15) & ~15;
++	rmesa->dma.current_vertexptr = rmesa->dma.current_used;
++
++	assert(rmesa->dma.current_used <= rmesa->dma.current->size);
 +}
 +
-+void radeon_cs_manager_legacy_dtor(struct radeon_cs_manager *csm)
++void radeonReleaseDmaRegion(radeonContextPtr rmesa)
 +{
-+    free(csm);
++	if (RADEON_DEBUG & DEBUG_IOCTL)
++		fprintf(stderr, "%s %p\n", __FUNCTION__, rmesa->dma.current);
++	if (rmesa->dma.current) {
++		rmesa->dma.nr_released_bufs++;
++		radeon_bo_unmap(rmesa->dma.current);
++	        radeon_bo_unref(rmesa->dma.current);
++	}
++	rmesa->dma.current = NULL;
 +}
 +
-diff --git a/src/mesa/drivers/dri/radeon/radeon_cs_legacy.h b/src/mesa/drivers/dri/radeon/radeon_cs_legacy.h
-new file mode 100644
-index 0000000..e177b4b
---- /dev/null
-+++ b/src/mesa/drivers/dri/radeon/radeon_cs_legacy.h
-@@ -0,0 +1,40 @@
-+/* 
-+ * Copyright © 2008 Nicolai Haehnle
-+ * Copyright © 2008 Jérôme Glisse
-+ * All Rights Reserved.
-+ * 
-+ * Permission is hereby granted, free of charge, to any person obtaining a
-+ * copy of this software and associated documentation files (the
-+ * "Software"), to deal in the Software without restriction, including
-+ * without limitation the rights to use, copy, modify, merge, publish,
-+ * distribute, sub license, and/or sell copies of the Software, and to
-+ * permit persons to whom the Software is furnished to do so, subject to
-+ * the following conditions:
-+ * 
-+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
-+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
-+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
-+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
-+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
-+ *
-+ * The above copyright notice and this permission notice (including the
-+ * next paragraph) shall be included in all copies or substantial portions
-+ * of the Software.
++
++/* Flush vertices in the current dma region.
 + */
-+/*
-+ * Authors:
-+ *      Aapo Tahkola <aet@rasterburn.org>
-+ *      Nicolai Haehnle <prefect_@gmx.net>
-+ *      Jérôme Glisse <glisse@freedesktop.org>
++void rcommon_flush_last_swtcl_prim( GLcontext *ctx  )
++{	/* Passes the vertices pending between dma->current_used and dma->current_vertexptr to vtbl.swtcl_flush, then resets swtcl.numverts. */
++	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++	struct radeon_dma *dma = &rmesa->dma;
++		
++
++	if (RADEON_DEBUG & DEBUG_IOCTL)
++		fprintf(stderr, "%s\n", __FUNCTION__);
++	dma->flush = NULL;	/* disarm the hook; rcommonAllocDmaLowVerts() installs this function in dma.flush again when needed */
++
++	if (dma->current) {
++	    GLuint current_offset = dma->current_used;	/* start of the not-yet-flushed range, saved before current_used is moved */
++
++	    assert (dma->current_used +	/* pending bytes must equal numverts * vertex_size * 4 */
++		    rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
++		    dma->current_vertexptr);
++
++	    if (dma->current_used != dma->current_vertexptr) {	/* call the hook only when something is pending */
++		    dma->current_used = dma->current_vertexptr;	/* mark the range as consumed before invoking the hook */
++
++		    rmesa->vtbl.swtcl_flush(ctx, current_offset);
++	    }
++	    rmesa->swtcl.numverts = 0;
++	}
++}
++/* Alloc space in the current dma region.
 + */
-+#ifndef RADEON_CS_LEGACY_H
-+#define RADEON_CS_LEGACY_H
++void *
++rcommonAllocDmaLowVerts( radeonContextPtr rmesa, int nverts, int vsize )
++{
++	GLuint bytes = vsize * nverts;
++	void *head;
 +
-+#include "radeon_common.h"
++	if (!rmesa->dma.current || rmesa->dma.current_vertexptr + bytes > rmesa->dma.current->size) {
++                radeonRefillCurrentDmaRegion(rmesa, bytes);
++	}
 +
-+struct radeon_cs_manager *radeon_cs_manager_legacy_ctor(struct radeon_context *ctx);
-+void radeon_cs_manager_legacy_dtor(struct radeon_cs_manager *csm);
++        if (!rmesa->dma.flush) {
++                rmesa->glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
++                rmesa->dma.flush = rcommon_flush_last_swtcl_prim;
++        }
 +
-+#endif
-diff --git a/src/mesa/drivers/dri/radeon/radeon_dma.c b/src/mesa/drivers/dri/radeon/radeon_dma.c
++	ASSERT( vsize == rmesa->swtcl.vertex_size * 4 );
++        ASSERT( rmesa->dma.flush == rcommon_flush_last_swtcl_prim );
++        ASSERT( rmesa->dma.current_used +
++                rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
++                rmesa->dma.current_vertexptr );
++
++	head = (rmesa->dma.current->ptr + rmesa->dma.current_vertexptr);
++	rmesa->dma.current_vertexptr += bytes;
++	rmesa->swtcl.numverts += nverts;
++	return head;
++}
+diff --git a/src/mesa/drivers/dri/radeon/radeon_dma.h b/src/mesa/drivers/dri/radeon/radeon_dma.h
 new file mode 100644
-index 0000000..393b121
+index 0000000..cee3744
 --- /dev/null
-+++ b/src/mesa/drivers/dri/radeon/radeon_dma.c
-@@ -0,0 +1,323 @@
++++ b/src/mesa/drivers/dri/radeon/radeon_dma.h
+@@ -0,0 +1,51 @@
 +/**************************************************************************
 +
 +Copyright (C) 2004 Nicolai Haehnle.
@@ -4168,360 +25368,2871 @@ index 0000000..393b121
 +
 +**************************************************************************/
 +
-+#include "radeon_common.h"
++#ifndef RADEON_DMA_H
++#define RADEON_DMA_H
 +
-+#if defined(USE_X86_ASM)
-+#define COPY_DWORDS( dst, src, nr )					\
-+do {									\
-+	int __tmp;							\
-+	__asm__ __volatile__( "rep ; movsl"				\
-+			      : "=%c" (__tmp), "=D" (dst), "=S" (__tmp)	\
-+			      : "0" (nr),				\
-+			        "D" ((long)dst),			\
-+			        "S" ((long)src) );			\
-+} while (0)
-+#else
-+#define COPY_DWORDS( dst, src, nr )		\
-+do {						\
-+   int j;					\
-+   for ( j = 0 ; j < nr ; j++ )			\
-+      dst[j] = ((int *)src)[j];			\
-+   dst += nr;					\
-+} while (0)
++void radeonEmitVec8(uint32_t *out, GLvoid * data, int stride, int count);
++void radeonEmitVec12(uint32_t *out, GLvoid * data, int stride, int count);
++
++void rcommon_emit_vector(GLcontext * ctx, struct radeon_aos *aos,
++			 GLvoid * data, int size, int stride, int count);
++
++void radeonRefillCurrentDmaRegion(radeonContextPtr rmesa, int size);
++void radeonAllocDmaRegion(radeonContextPtr rmesa,
++			  struct radeon_bo **pbo, int *poffset,
++			  int bytes, int alignment);
++void radeonReleaseDmaRegion(radeonContextPtr rmesa);
++
++void rcommon_flush_last_swtcl_prim(GLcontext *ctx);
++
++void *rcommonAllocDmaLowVerts(radeonContextPtr rmesa, int nverts, int vsize);
++#endif
+diff --git a/src/mesa/drivers/dri/radeon/radeon_ioctl.c b/src/mesa/drivers/dri/radeon/radeon_ioctl.c
+index 09acf6b..b5ab923 100644
+--- a/src/mesa/drivers/dri/radeon/radeon_ioctl.c
++++ b/src/mesa/drivers/dri/radeon/radeon_ioctl.c
+@@ -43,6 +43,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "swrast/swrast.h"
+ 
+ #include "radeon_context.h"
++#include "radeon_common.h"
+ #include "radeon_state.h"
+ #include "radeon_ioctl.h"
+ #include "radeon_tcl.h"
+@@ -58,75 +59,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #define RADEON_IDLE_RETRY           16
+ 
+ 
+-static void radeonWaitForIdle( radeonContextPtr rmesa );
+-static int radeonFlushCmdBufLocked( radeonContextPtr rmesa, 
+-				    const char * caller );
+-
+-static void print_state_atom( struct radeon_state_atom *state )
+-{
+-   int i;
+-
+-   fprintf(stderr, "emit %s/%d\n", state->name, state->cmd_size);
+-
+-   if (RADEON_DEBUG & DEBUG_VERBOSE) 
+-      for (i = 0 ; i < state->cmd_size ; i++) 
+-	 fprintf(stderr, "\t%s[%d]: %x\n", state->name, i, state->cmd[i]);
+-
+-}
+-
+-static void radeonSaveHwState( radeonContextPtr rmesa )
+-{
+-   struct radeon_state_atom *atom;
+-   char * dest = rmesa->backup_store.cmd_buf;
+-
+-   if (RADEON_DEBUG & DEBUG_STATE)
+-      fprintf(stderr, "%s\n", __FUNCTION__);
+-   
+-   rmesa->backup_store.cmd_used = 0;
+-
+-   foreach( atom, &rmesa->hw.atomlist ) {
+-      if ( atom->check( rmesa->glCtx ) ) {
+-	 int size = atom->cmd_size * 4;
+-	 memcpy( dest, atom->cmd, size);
+-	 dest += size;
+-	 rmesa->backup_store.cmd_used += size;
+-	 if (RADEON_DEBUG & DEBUG_STATE)
+-	    print_state_atom( atom );
+-      }
+-   }
+-
+-   assert( rmesa->backup_store.cmd_used <= RADEON_CMD_BUF_SZ );
+-   if (RADEON_DEBUG & DEBUG_STATE)
+-      fprintf(stderr, "Returning to radeonEmitState\n");
+-}
+-
+-/* At this point we were in FlushCmdBufLocked but we had lost our context, so
+- * we need to unwire our current cmdbuf, hook the one with the saved state in
+- * it, flush it, and then put the current one back.  This is so commands at the
+- * start of a cmdbuf can rely on the state being kept from the previous one.
+- */
+-static void radeonBackUpAndEmitLostStateLocked( radeonContextPtr rmesa )
+-{
+-   GLuint nr_released_bufs;
+-   struct radeon_store saved_store;
+-
+-   if (rmesa->backup_store.cmd_used == 0)
+-      return;
+-
+-   if (RADEON_DEBUG & DEBUG_STATE)
+-      fprintf(stderr, "Emitting backup state on lost context\n");
+-
+-   rmesa->lost_context = GL_FALSE;
+-
+-   nr_released_bufs = rmesa->dma.nr_released_bufs;
+-   saved_store = rmesa->store;
+-   rmesa->dma.nr_released_bufs = 0;
+-   rmesa->store = rmesa->backup_store;
+-   radeonFlushCmdBufLocked( rmesa, __FUNCTION__ );
+-   rmesa->dma.nr_released_bufs = nr_released_bufs;
+-   rmesa->store = saved_store;
+-}
+-
+ /* =============================================================
+  * Kernel command buffer handling
+  */
+@@ -134,893 +66,340 @@ static void radeonBackUpAndEmitLostStateLocked( radeonContextPtr rmesa )
+ /* The state atoms will be emitted in the order they appear in the atom list,
+  * so this step is important.
+  */
+-void radeonSetUpAtomList( radeonContextPtr rmesa )
++void radeonSetUpAtomList( r100ContextPtr rmesa )
+ {
+-   int i, mtu = rmesa->glCtx->Const.MaxTextureUnits;
+-
+-   make_empty_list(&rmesa->hw.atomlist);
+-   rmesa->hw.atomlist.name = "atom-list";
+-
+-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.ctx);
+-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.set);
+-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.lin);
+-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.msk);
+-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.vpt);
+-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.tcl);
+-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.msc);
++   int i, mtu = rmesa->radeon.glCtx->Const.MaxTextureUnits;
++
++   make_empty_list(&rmesa->radeon.hw.atomlist);
++   rmesa->radeon.hw.atomlist.name = "atom-list";
++
++   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.ctx);
++   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.set);
++   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.lin);
++   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.msk);
++   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.vpt);
++   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.tcl);
++   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.msc);
+    for (i = 0; i < mtu; ++i) {
+-       insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.tex[i]);
+-       insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.txr[i]);
+-       insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.cube[i]);
++       insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.tex[i]);
++       insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.txr[i]);
++       insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.cube[i]);
+    }
+-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.zbs);
+-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.mtl);
++   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.zbs);
++   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.mtl);
+    for (i = 0; i < 3 + mtu; ++i)
+-      insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.mat[i]);
++      insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.mat[i]);
+    for (i = 0; i < 8; ++i)
+-      insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.lit[i]);
++      insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.lit[i]);
+    for (i = 0; i < 6; ++i)
+-      insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.ucp[i]);
+-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.eye);
+-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.grd);
+-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.fog);
+-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.glt);
+-}
+-
+-void radeonEmitState( radeonContextPtr rmesa )
+-{
+-   struct radeon_state_atom *atom;
+-   char *dest;
+-
+-   if (RADEON_DEBUG & (DEBUG_STATE|DEBUG_PRIMS))
+-      fprintf(stderr, "%s\n", __FUNCTION__);
+-
+-   if (rmesa->save_on_next_emit) {
+-      radeonSaveHwState(rmesa);
+-      rmesa->save_on_next_emit = GL_FALSE;
+-   }
+-
+-   /* this code used to return here but now it emits zbs */
+-
+-   /* To avoid going across the entire set of states multiple times, just check
+-    * for enough space for the case of emitting all state, and inline the
+-    * radeonAllocCmdBuf code here without all the checks.
+-    */
+-   radeonEnsureCmdBufSpace(rmesa, rmesa->hw.max_state_size);
+-   dest = rmesa->store.cmd_buf + rmesa->store.cmd_used;
+-
+-   /* We always always emit zbs, this is due to a bug found by keithw in
+-      the hardware and rediscovered after Erics changes by me.
+-      if you ever touch this code make sure you emit zbs otherwise
+-      you get tcl lockups on at least M7/7500 class of chips - airlied */
+-   rmesa->hw.zbs.dirty=1;
+-
+-   if (RADEON_DEBUG & DEBUG_STATE) {
+-      foreach(atom, &rmesa->hw.atomlist) {
+-	 if (atom->dirty || rmesa->hw.all_dirty) {
+-	    if (atom->check(rmesa->glCtx))
+-	       print_state_atom(atom);
+-	    else
+-	       fprintf(stderr, "skip state %s\n", atom->name);
+-	 }
+-      }
+-   }
+-
+-   foreach(atom, &rmesa->hw.atomlist) {
+-      if (rmesa->hw.all_dirty)
+-	 atom->dirty = GL_TRUE;
+-      if (!(rmesa->radeonScreen->chip_flags & RADEON_CHIPSET_TCL) &&
+-	   atom->is_tcl)
+-	 atom->dirty = GL_FALSE;
+-      if (atom->dirty) {
+-	 if (atom->check(rmesa->glCtx)) {
+-	    int size = atom->cmd_size * 4;
+-	    memcpy(dest, atom->cmd, size);
+-	    dest += size;
+-	    rmesa->store.cmd_used += size;
+-	    atom->dirty = GL_FALSE;
+-	 }
+-      }
+-   }
+-
+-   assert(rmesa->store.cmd_used <= RADEON_CMD_BUF_SZ);
+- 
+-   rmesa->hw.is_dirty = GL_FALSE;
+-   rmesa->hw.all_dirty = GL_FALSE;
++      insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.ucp[i]);
++   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.eye);
++   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.grd);
++   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.fog);
++   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.glt);
+ }
+ 
+ /* Fire a section of the retained (indexed_verts) buffer as a regular
+  * primtive.  
+  */
+-extern void radeonEmitVbufPrim( radeonContextPtr rmesa,
++extern void radeonEmitVbufPrim( r100ContextPtr rmesa,
+ 				GLuint vertex_format,
+ 				GLuint primitive,
+ 				GLuint vertex_nr )
+ {
+-   drm_radeon_cmd_header_t *cmd;
+-
++   BATCH_LOCALS(&rmesa->radeon);
+ 
+    assert(!(primitive & RADEON_CP_VC_CNTL_PRIM_WALK_IND));
+    
+-   radeonEmitState( rmesa );
++   radeonEmitState(&rmesa->radeon);
+ 
+-   if (RADEON_DEBUG & DEBUG_IOCTL)
+-      fprintf(stderr, "%s cmd_used/4: %d\n", __FUNCTION__,
+-	      rmesa->store.cmd_used/4);
+-   
+-   cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa, VBUF_BUFSZ,
+-						       __FUNCTION__ );
+ #if RADEON_OLD_PACKETS
+-   cmd[0].i = 0;
+-   cmd[0].header.cmd_type = RADEON_CMD_PACKET3_CLIP;
+-   cmd[1].i = RADEON_CP_PACKET3_3D_RNDR_GEN_INDX_PRIM | (3 << 16);
+-   cmd[2].i = rmesa->ioctl.vertex_offset;
+-   cmd[3].i = vertex_nr;
+-   cmd[4].i = vertex_format;
+-   cmd[5].i = (primitive | 
+-	       RADEON_CP_VC_CNTL_PRIM_WALK_LIST |
+-	       RADEON_CP_VC_CNTL_COLOR_ORDER_RGBA |
+-	       RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE |
+-	       (vertex_nr << RADEON_CP_VC_CNTL_NUM_SHIFT));
+-
+-   if (RADEON_DEBUG & DEBUG_PRIMS)
+-      fprintf(stderr, "%s: header 0x%x offt 0x%x vfmt 0x%x vfcntl %x \n",
+-	      __FUNCTION__,
+-	      cmd[1].i, cmd[2].i, cmd[4].i, cmd[5].i);
+-#else
+-   cmd[0].i = 0;
+-   cmd[0].header.cmd_type = RADEON_CMD_PACKET3_CLIP;
+-   cmd[1].i = RADEON_CP_PACKET3_3D_DRAW_VBUF | (1 << 16);
+-   cmd[2].i = vertex_format;
+-   cmd[3].i = (primitive | 
+-	       RADEON_CP_VC_CNTL_PRIM_WALK_LIST |
+-	       RADEON_CP_VC_CNTL_COLOR_ORDER_RGBA |
+-	       RADEON_CP_VC_CNTL_MAOS_ENABLE |
+-	       RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE |
+-	       (vertex_nr << RADEON_CP_VC_CNTL_NUM_SHIFT));
+-
+-
+-   if (RADEON_DEBUG & DEBUG_PRIMS)
+-      fprintf(stderr, "%s: header 0x%x vfmt 0x%x vfcntl %x \n",
+-	      __FUNCTION__,
+-	      cmd[1].i, cmd[2].i, cmd[3].i);
++   BEGIN_BATCH(8);
++   OUT_BATCH_PACKET3_CLIP(RADEON_CP_PACKET3_3D_RNDR_GEN_INDX_PRIM, 3);
++   if (!rmesa->radeon.radeonScreen->kernel_mm) {
++     OUT_BATCH_RELOC(rmesa->ioctl.vertex_offset, rmesa->ioctl.bo, rmesa->ioctl.vertex_offset, RADEON_GEM_DOMAIN_GTT, 0, 0);
++   } else {
++     OUT_BATCH(rmesa->ioctl.vertex_offset);
++   }
++    
++   OUT_BATCH(vertex_nr);
++   OUT_BATCH(vertex_format);
++   OUT_BATCH(primitive |  RADEON_CP_VC_CNTL_PRIM_WALK_LIST |
++	     RADEON_CP_VC_CNTL_COLOR_ORDER_RGBA |
++	     RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE |
++	     (vertex_nr << RADEON_CP_VC_CNTL_NUM_SHIFT));
++
++   if (rmesa->radeon.radeonScreen->kernel_mm) {
++     radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
++			   rmesa->ioctl.bo,
++			   RADEON_GEM_DOMAIN_GTT,
++			   0, 0);
++   }
++   
++   END_BATCH();
++   
++#else   
++   BEGIN_BATCH(4);
++   OUT_BATCH_PACKET3_CLIP(RADEON_CP_PACKET3_3D_DRAW_VBUF, 1);
++   OUT_BATCH(vertex_format);
++   OUT_BATCH(primitive |
++	     RADEON_CP_VC_CNTL_PRIM_WALK_LIST |
++	     RADEON_CP_VC_CNTL_COLOR_ORDER_RGBA |
++	     RADEON_CP_VC_CNTL_MAOS_ENABLE |
++	     RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE |
++	     (vertex_nr << RADEON_CP_VC_CNTL_NUM_SHIFT));
++   END_BATCH();
+ #endif
+ }
+ 
+-
+-void radeonFlushElts( radeonContextPtr rmesa )
++void radeonFlushElts( GLcontext *ctx )
+ {
+-   int *cmd = (int *)(rmesa->store.cmd_buf + rmesa->store.elts_start);
+-   int dwords;
+-#if RADEON_OLD_PACKETS
+-   int nr = (rmesa->store.cmd_used - (rmesa->store.elts_start + 24)) / 2;
+-#else
+-   int nr = (rmesa->store.cmd_used - (rmesa->store.elts_start + 16)) / 2;
+-#endif
+-
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
++   BATCH_LOCALS(&rmesa->radeon);
++   int nr;
++   uint32_t *cmd = (uint32_t *)(rmesa->radeon.cmdbuf.cs->packets + rmesa->tcl.elt_cmd_start);
++   int dwords = (rmesa->radeon.cmdbuf.cs->section_ndw - rmesa->radeon.cmdbuf.cs->section_cdw);
++   
+    if (RADEON_DEBUG & DEBUG_IOCTL)
+       fprintf(stderr, "%s\n", __FUNCTION__);
+ 
+-   assert( rmesa->dma.flush == radeonFlushElts );
+-   rmesa->dma.flush = NULL;
++   assert( rmesa->radeon.dma.flush == radeonFlushElts );
++   rmesa->radeon.dma.flush = NULL;
+ 
+-   /* Cope with odd number of elts:
+-    */
+-   rmesa->store.cmd_used = (rmesa->store.cmd_used + 2) & ~2;
+-   dwords = (rmesa->store.cmd_used - rmesa->store.elts_start) / 4;
++   nr = rmesa->tcl.elt_used;
+ 
+ #if RADEON_OLD_PACKETS
+-   cmd[1] |= (dwords - 3) << 16;
++   if (rmesa->radeon.radeonScreen->kernel_mm) {
++     dwords -= 2;
++   }
 +#endif
 +
-+static void radeonEmitVec4(uint32_t *out, GLvoid * data, int stride, int count)
-+{
-+	int i;
++#if RADEON_OLD_PACKETS
++   cmd[1] |= (dwords + 3) << 16;
+    cmd[5] |= nr << RADEON_CP_VC_CNTL_NUM_SHIFT;
+ #else
+-   cmd[1] |= (dwords - 3) << 16;
++   cmd[1] |= (dwords + 2) << 16;
+    cmd[3] |= nr << RADEON_CP_VC_CNTL_NUM_SHIFT;
+ #endif
+ 
++   rmesa->radeon.cmdbuf.cs->cdw += dwords;
++   rmesa->radeon.cmdbuf.cs->section_cdw += dwords;
++
++#if RADEON_OLD_PACKETS
++   if (rmesa->radeon.radeonScreen->kernel_mm) {
++      radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
++			    rmesa->ioctl.bo,
++			    RADEON_GEM_DOMAIN_GTT,
++			    0, 0);
++   }
++#endif
 +
-+	if (RADEON_DEBUG & DEBUG_VERTS)
-+		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
-+			__FUNCTION__, count, stride, (void *)out, (void *)data);
++   END_BATCH();
 +
-+	if (stride == 4)
-+		COPY_DWORDS(out, data, count);
-+	else
-+		for (i = 0; i < count; i++) {
-+			out[0] = *(int *)data;
-+			out++;
-+			data += stride;
-+		}
+    if (RADEON_DEBUG & DEBUG_SYNC) {
+       fprintf(stderr, "%s: Syncing\n", __FUNCTION__);
+-      radeonFinish( rmesa->glCtx );
++      radeonFinish( rmesa->radeon.glCtx );
+    }
+-}
+ 
 +}
+ 
+-GLushort *radeonAllocEltsOpenEnded( radeonContextPtr rmesa,
++GLushort *radeonAllocEltsOpenEnded( r100ContextPtr rmesa,
+ 				    GLuint vertex_format,
+ 				    GLuint primitive,
+ 				    GLuint min_nr )
+ {
+-   drm_radeon_cmd_header_t *cmd;
+    GLushort *retval;
++   int align_min_nr;
++   BATCH_LOCALS(&rmesa->radeon);
+ 
+    if (RADEON_DEBUG & DEBUG_IOCTL)
+-      fprintf(stderr, "%s %d\n", __FUNCTION__, min_nr);
++      fprintf(stderr, "%s %d prim %x\n", __FUNCTION__, min_nr, primitive);
+ 
+    assert((primitive & RADEON_CP_VC_CNTL_PRIM_WALK_IND));
+    
+-   radeonEmitState( rmesa );
++   radeonEmitState(&rmesa->radeon);
+    
+-   cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa,
+-						       ELTS_BUFSZ(min_nr),
+-						       __FUNCTION__ );
++   rmesa->tcl.elt_cmd_start = rmesa->radeon.cmdbuf.cs->cdw;
++
++   /* round up min_nr to align the state */
++   align_min_nr = (min_nr + 1) & ~1;
++
+ #if RADEON_OLD_PACKETS
+-   cmd[0].i = 0;
+-   cmd[0].header.cmd_type = RADEON_CMD_PACKET3_CLIP;
+-   cmd[1].i = RADEON_CP_PACKET3_3D_RNDR_GEN_INDX_PRIM;
+-   cmd[2].i = rmesa->ioctl.vertex_offset;
+-   cmd[3].i = 0xffff;
+-   cmd[4].i = vertex_format;
+-   cmd[5].i = (primitive | 
+-	       RADEON_CP_VC_CNTL_PRIM_WALK_IND |
+-	       RADEON_CP_VC_CNTL_COLOR_ORDER_RGBA |
+-	       RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE);
+-
+-   retval = (GLushort *)(cmd+6);
+-#else   
+-   cmd[0].i = 0;
+-   cmd[0].header.cmd_type = RADEON_CMD_PACKET3_CLIP;
+-   cmd[1].i = RADEON_CP_PACKET3_3D_DRAW_INDX;
+-   cmd[2].i = vertex_format;
+-   cmd[3].i = (primitive | 
+-	       RADEON_CP_VC_CNTL_PRIM_WALK_IND |
+-	       RADEON_CP_VC_CNTL_COLOR_ORDER_RGBA |
+-	       RADEON_CP_VC_CNTL_MAOS_ENABLE |
+-	       RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE);
+-
+-   retval = (GLushort *)(cmd+4);
++   BEGIN_BATCH_NO_AUTOSTATE(2+ELTS_BUFSZ(align_min_nr)/4);
++   OUT_BATCH_PACKET3_CLIP(RADEON_CP_PACKET3_3D_RNDR_GEN_INDX_PRIM, 0);
++   if (!rmesa->radeon.radeonScreen->kernel_mm) {
++     OUT_BATCH_RELOC(rmesa->ioctl.vertex_offset, rmesa->ioctl.bo, rmesa->ioctl.vertex_offset, RADEON_GEM_DOMAIN_GTT, 0, 0);
++   } else {
++     OUT_BATCH(rmesa->ioctl.vertex_offset);
++   }
++   OUT_BATCH(0xffff);
++   OUT_BATCH(vertex_format);
++   OUT_BATCH(primitive | 
++	     RADEON_CP_VC_CNTL_PRIM_WALK_IND |
++	     RADEON_CP_VC_CNTL_COLOR_ORDER_RGBA |
++	     RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE);
 +
-+void radeonEmitVec8(uint32_t *out, GLvoid * data, int stride, int count)
-+{
-+	int i;
++#else
++   BEGIN_BATCH_NO_AUTOSTATE(ELTS_BUFSZ(align_min_nr)/4);
++   OUT_BATCH_PACKET3_CLIP(RADEON_CP_PACKET3_DRAW_INDX, 0);
++   OUT_BATCH(vertex_format);
++   OUT_BATCH(primitive | 
++	     RADEON_CP_VC_CNTL_PRIM_WALK_IND |
++	     RADEON_CP_VC_CNTL_COLOR_ORDER_RGBA |
++	     RADEON_CP_VC_CNTL_MAOS_ENABLE |
++	     RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE);
+ #endif
+ 
+-   if (RADEON_DEBUG & DEBUG_PRIMS)
+-      fprintf(stderr, "%s: header 0x%x vfmt 0x%x prim %x \n",
+-	      __FUNCTION__,
+-	      cmd[1].i, vertex_format, primitive);
+ 
+-   assert(!rmesa->dma.flush);
+-   rmesa->glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
+-   rmesa->dma.flush = radeonFlushElts;
++   rmesa->tcl.elt_cmd_offset = rmesa->radeon.cmdbuf.cs->cdw;
++   rmesa->tcl.elt_used = min_nr;
+ 
+-   rmesa->store.elts_start = ((char *)cmd) - rmesa->store.cmd_buf;
++   retval = (GLushort *)(rmesa->radeon.cmdbuf.cs->packets + rmesa->tcl.elt_cmd_offset);
++   
++   if (RADEON_DEBUG & DEBUG_PRIMS)
++      fprintf(stderr, "%s: header prim %x \n",
++	      __FUNCTION__, primitive);
++
++   assert(!rmesa->radeon.dma.flush);
++   rmesa->radeon.glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
++   rmesa->radeon.dma.flush = radeonFlushElts;
+ 
+    return retval;
+ }
+ 
+-
+-
+-void radeonEmitVertexAOS( radeonContextPtr rmesa,
++void radeonEmitVertexAOS( r100ContextPtr rmesa,
+ 			  GLuint vertex_size,
++			  struct radeon_bo *bo,
+ 			  GLuint offset )
+ {
+ #if RADEON_OLD_PACKETS
+-   rmesa->ioctl.vertex_size = vertex_size;
+    rmesa->ioctl.vertex_offset = offset;
++   rmesa->ioctl.bo = bo;
+ #else
+-   drm_radeon_cmd_header_t *cmd;
++   BATCH_LOCALS(&rmesa->radeon);
+ 
+    if (RADEON_DEBUG & (DEBUG_PRIMS|DEBUG_IOCTL))
+       fprintf(stderr, "%s:  vertex_size 0x%x offset 0x%x \n",
+ 	      __FUNCTION__, vertex_size, offset);
+ 
+-   cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa, VERT_AOS_BUFSZ,
+-						  __FUNCTION__ );
++   BEGIN_BATCH(7);
++   OUT_BATCH_PACKET3(RADEON_CP_PACKET3_3D_LOAD_VBPNTR, 2);
++   OUT_BATCH(1);
++   OUT_BATCH(vertex_size | (vertex_size << 8));
++   OUT_BATCH_RELOC(offset, bo, offset, RADEON_GEM_DOMAIN_GTT, 0, 0);
++   END_BATCH();
+ 
+-   cmd[0].i = 0;
+-   cmd[0].header.cmd_type = RADEON_CMD_PACKET3;
+-   cmd[1].i = RADEON_CP_PACKET3_3D_LOAD_VBPNTR | (2 << 16);
+-   cmd[2].i = 1;
+-   cmd[3].i = vertex_size | (vertex_size << 8);
+-   cmd[4].i = offset;
+ #endif
+ }
+ 		       
+ 
+-void radeonEmitAOS( radeonContextPtr rmesa,
+-		    struct radeon_dma_region **component,
++void radeonEmitAOS( r100ContextPtr rmesa,
+ 		    GLuint nr,
+ 		    GLuint offset )
+ {
+ #if RADEON_OLD_PACKETS
+    assert( nr == 1 );
+-   assert( component[0]->aos_size == component[0]->aos_stride );
+-   rmesa->ioctl.vertex_size = component[0]->aos_size;
++   rmesa->ioctl.bo = rmesa->tcl.aos[0].bo;
+    rmesa->ioctl.vertex_offset = 
+-      (component[0]->aos_start + offset * component[0]->aos_stride * 4);
++     (rmesa->tcl.aos[0].offset + offset * rmesa->tcl.aos[0].stride * 4);
+ #else
+-   drm_radeon_cmd_header_t *cmd;
+-   int sz = AOS_BUFSZ(nr);
++   BATCH_LOCALS(&rmesa->radeon);
++   uint32_t voffset;
++   //   int sz = AOS_BUFSZ(nr);
++   int sz = 1 + (nr >> 1) * 3 + (nr & 1) * 2;
+    int i;
+-   int *tmp;
+ 
+    if (RADEON_DEBUG & DEBUG_IOCTL)
+       fprintf(stderr, "%s\n", __FUNCTION__);
+ 
+-
+-   cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa, sz,
+-						  __FUNCTION__ );
+-   cmd[0].i = 0;
+-   cmd[0].header.cmd_type = RADEON_CMD_PACKET3;
+-   cmd[1].i = RADEON_CP_PACKET3_3D_LOAD_VBPNTR | (((sz / sizeof(int))-3) << 16);
+-   cmd[2].i = nr;
+-   tmp = &cmd[0].i;
+-   cmd += 3;
+-
+-   for (i = 0 ; i < nr ; i++) {
+-      if (i & 1) {
+-	 cmd[0].i |= ((component[i]->aos_stride << 24) | 
+-		      (component[i]->aos_size << 16));
+-	 cmd[2].i = (component[i]->aos_start + 
+-		     offset * component[i]->aos_stride * 4);
+-	 cmd += 3;
+-      }
+-      else {
+-	 cmd[0].i = ((component[i]->aos_stride << 8) | 
+-		     (component[i]->aos_size << 0));
+-	 cmd[1].i = (component[i]->aos_start + 
+-		     offset * component[i]->aos_stride * 4);
+-      }
+-   }
+-
+-   if (RADEON_DEBUG & DEBUG_VERTS) {
+-      fprintf(stderr, "%s:\n", __FUNCTION__);
+-      for (i = 0 ; i < sz ; i++)
+-	 fprintf(stderr, "   %d: %x\n", i, tmp[i]);
+-   }
+-#endif
+-}
+-
+-/* using already shifted color_fmt! */
+-void radeonEmitBlit( radeonContextPtr rmesa, /* FIXME: which drmMinor is required? */
+-		   GLuint color_fmt,
+-		   GLuint src_pitch,
+-		   GLuint src_offset,
+-		   GLuint dst_pitch,
+-		   GLuint dst_offset,
+-		   GLint srcx, GLint srcy,
+-		   GLint dstx, GLint dsty,
+-		   GLuint w, GLuint h )
+-{
+-   drm_radeon_cmd_header_t *cmd;
+-
+-   if (RADEON_DEBUG & DEBUG_IOCTL)
+-      fprintf(stderr, "%s src %x/%x %d,%d dst: %x/%x %d,%d sz: %dx%d\n",
+-	      __FUNCTION__, 
+-	      src_pitch, src_offset, srcx, srcy,
+-	      dst_pitch, dst_offset, dstx, dsty,
+-	      w, h);
+-
+-   assert( (src_pitch & 63) == 0 );
+-   assert( (dst_pitch & 63) == 0 );
+-   assert( (src_offset & 1023) == 0 ); 
+-   assert( (dst_offset & 1023) == 0 ); 
+-   assert( w < (1<<16) );
+-   assert( h < (1<<16) );
+-
+-   cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa, 8 * sizeof(int),
+-						  __FUNCTION__ );
+-
+-
+-   cmd[0].i = 0;
+-   cmd[0].header.cmd_type = RADEON_CMD_PACKET3;
+-   cmd[1].i = RADEON_CP_PACKET3_CNTL_BITBLT_MULTI | (5 << 16);
+-   cmd[2].i = (RADEON_GMC_SRC_PITCH_OFFSET_CNTL |
+-	       RADEON_GMC_DST_PITCH_OFFSET_CNTL |
+-	       RADEON_GMC_BRUSH_NONE |
+-	       color_fmt |
+-	       RADEON_GMC_SRC_DATATYPE_COLOR |
+-	       RADEON_ROP3_S |
+-	       RADEON_DP_SRC_SOURCE_MEMORY |
+-	       RADEON_GMC_CLR_CMP_CNTL_DIS |
+-	       RADEON_GMC_WR_MSK_DIS );
+-
+-   cmd[3].i = ((src_pitch/64)<<22) | (src_offset >> 10);
+-   cmd[4].i = ((dst_pitch/64)<<22) | (dst_offset >> 10);
+-   cmd[5].i = (srcx << 16) | srcy;
+-   cmd[6].i = (dstx << 16) | dsty; /* dst */
+-   cmd[7].i = (w << 16) | h;
+-}
+-
+-
+-void radeonEmitWait( radeonContextPtr rmesa, GLuint flags )
+-{
+-   drm_radeon_cmd_header_t *cmd;
+-
+-   assert( !(flags & ~(RADEON_WAIT_2D|RADEON_WAIT_3D)) );
+-
+-   cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa, 1 * sizeof(int),
+-					   __FUNCTION__ );
+-   cmd[0].i = 0;
+-   cmd[0].wait.cmd_type = RADEON_CMD_WAIT;
+-   cmd[0].wait.flags = flags;
+-}
+-
+-
+-static int radeonFlushCmdBufLocked( radeonContextPtr rmesa, 
+-				    const char * caller )
+-{
+-   int ret, i;
+-   drm_radeon_cmd_buffer_t cmd;
+-
+-   if (rmesa->lost_context)
+-      radeonBackUpAndEmitLostStateLocked(rmesa);
+-
+-   if (RADEON_DEBUG & DEBUG_IOCTL) {
+-      fprintf(stderr, "%s from %s\n", __FUNCTION__, caller); 
+-
+-      if (RADEON_DEBUG & DEBUG_VERBOSE) 
+-	 for (i = 0 ; i < rmesa->store.cmd_used ; i += 4 )
+-	    fprintf(stderr, "%d: %x\n", i/4, 
+-		    *(int *)(&rmesa->store.cmd_buf[i]));
+-   }
+-
+-   if (RADEON_DEBUG & DEBUG_DMA)
+-      fprintf(stderr, "%s: Releasing %d buffers\n", __FUNCTION__,
+-	      rmesa->dma.nr_released_bufs);
+-
+-
+-   if (RADEON_DEBUG & DEBUG_SANITY) {
+-      if (rmesa->state.scissor.enabled) 
+-	 ret = radeonSanityCmdBuffer( rmesa, 
+-				      rmesa->state.scissor.numClipRects,
+-				      rmesa->state.scissor.pClipRects);
+-      else
+-	 ret = radeonSanityCmdBuffer( rmesa, 
+-				      rmesa->numClipRects,
+-				      rmesa->pClipRects);
+-      if (ret) {
+-	 fprintf(stderr, "drmSanityCommandWrite: %d\n", ret);	 
+-	 goto out;
++   BEGIN_BATCH(sz+2+(nr * 2));
++   OUT_BATCH_PACKET3(RADEON_CP_PACKET3_3D_LOAD_VBPNTR, sz - 1);
++   OUT_BATCH(nr);
++
++   if (!rmesa->radeon.radeonScreen->kernel_mm) {
++      for (i = 0; i + 1 < nr; i += 2) {
++	 OUT_BATCH((rmesa->tcl.aos[i].components << 0) |
++		   (rmesa->tcl.aos[i].stride << 8) |
++		   (rmesa->tcl.aos[i + 1].components << 16) |
++		   (rmesa->tcl.aos[i + 1].stride << 24));
++			
++	 voffset =  rmesa->tcl.aos[i + 0].offset +
++	    offset * 4 * rmesa->tcl.aos[i + 0].stride;
++	 OUT_BATCH_RELOC(voffset,
++			 rmesa->tcl.aos[i].bo,
++			 voffset,
++			 RADEON_GEM_DOMAIN_GTT,
++			 0, 0);
++	 voffset =  rmesa->tcl.aos[i + 1].offset +
++	    offset * 4 * rmesa->tcl.aos[i + 1].stride;
++	 OUT_BATCH_RELOC(voffset,
++			 rmesa->tcl.aos[i+1].bo,
++			 voffset,
++			 RADEON_GEM_DOMAIN_GTT,
++			 0, 0);
+       }
+-   }
+-
+-
+-   cmd.bufsz = rmesa->store.cmd_used;
+-   cmd.buf = rmesa->store.cmd_buf;
+-
+-   if (rmesa->state.scissor.enabled) {
+-      cmd.nbox = rmesa->state.scissor.numClipRects;
+-      cmd.boxes = rmesa->state.scissor.pClipRects;
+-   } else {
+-      cmd.nbox = rmesa->numClipRects;
+-      cmd.boxes = rmesa->pClipRects;
+-   }
+-
+-   ret = drmCommandWrite( rmesa->dri.fd,
+-			  DRM_RADEON_CMDBUF,
+-			  &cmd, sizeof(cmd) );
+-
+-   if (ret)
+-      fprintf(stderr, "drmCommandWrite: %d\n", ret);
+-
+-   if (RADEON_DEBUG & DEBUG_SYNC) {
+-      fprintf(stderr, "\nSyncing in %s\n\n", __FUNCTION__);
+-      radeonWaitForIdleLocked( rmesa );
+-   }
+-
+- out:
+-   rmesa->store.primnr = 0;
+-   rmesa->store.statenr = 0;
+-   rmesa->store.cmd_used = 0;
+-   rmesa->dma.nr_released_bufs = 0;
+-   rmesa->save_on_next_emit = 1;
+-
+-   return ret;
+-}
+-
+-
+-/* Note: does not emit any commands to avoid recursion on
+- * radeonAllocCmdBuf.
+- */
+-void radeonFlushCmdBuf( radeonContextPtr rmesa, const char *caller )
+-{
+-   int ret;
+-
+-	      
+-   LOCK_HARDWARE( rmesa );
+-
+-   ret = radeonFlushCmdBufLocked( rmesa, caller );
+-
+-   UNLOCK_HARDWARE( rmesa );
+-
+-   if (ret) {
+-      fprintf(stderr, "drm_radeon_cmd_buffer_t: %d (exiting)\n", ret);
+-      exit(ret);
+-   }
+-}
+-
+-/* =============================================================
+- * Hardware vertex buffer handling
+- */
+-
+-
+-void radeonRefillCurrentDmaRegion( radeonContextPtr rmesa )
+-{
+-   struct radeon_dma_buffer *dmabuf;
+-   int fd = rmesa->dri.fd;
+-   int index = 0;
+-   int size = 0;
+-   drmDMAReq dma;
+-   int ret;
+-
+-   if (RADEON_DEBUG & (DEBUG_IOCTL|DEBUG_DMA))
+-      fprintf(stderr, "%s\n", __FUNCTION__);  
+-
+-   if (rmesa->dma.flush) {
+-      rmesa->dma.flush( rmesa );
+-   }
+-
+-   if (rmesa->dma.current.buf)
+-      radeonReleaseDmaRegion( rmesa, &rmesa->dma.current, __FUNCTION__ );
+-
+-   if (rmesa->dma.nr_released_bufs > 4)
+-      radeonFlushCmdBuf( rmesa, __FUNCTION__ );
+-
+-   dma.context = rmesa->dri.hwContext;
+-   dma.send_count = 0;
+-   dma.send_list = NULL;
+-   dma.send_sizes = NULL;
+-   dma.flags = 0;
+-   dma.request_count = 1;
+-   dma.request_size = RADEON_BUFFER_SIZE;
+-   dma.request_list = &index;
+-   dma.request_sizes = &size;
+-   dma.granted_count = 0;
+-
+-   LOCK_HARDWARE(rmesa);	/* no need to validate */
+-
+-   ret = drmDMA( fd, &dma );
+       
+-   if (ret != 0) {
+-      /* Free some up this way?
+-       */
+-      if (rmesa->dma.nr_released_bufs) {
+-	 radeonFlushCmdBufLocked( rmesa, __FUNCTION__ );
++      if (nr & 1) {
++	 OUT_BATCH((rmesa->tcl.aos[nr - 1].components << 0) |
++		   (rmesa->tcl.aos[nr - 1].stride << 8));
++	 voffset =  rmesa->tcl.aos[nr - 1].offset +
++	    offset * 4 * rmesa->tcl.aos[nr - 1].stride;
++	 OUT_BATCH_RELOC(voffset,
++			 rmesa->tcl.aos[nr - 1].bo,
++			 voffset,
++			 RADEON_GEM_DOMAIN_GTT,
++			 0, 0);
+       }
+-      
+-      if (RADEON_DEBUG & DEBUG_DMA)
+-	 fprintf(stderr, "Waiting for buffers\n");
+-
+-      radeonWaitForIdleLocked( rmesa );
+-      ret = drmDMA( fd, &dma );
+-
+-      if ( ret != 0 ) {
+-	 UNLOCK_HARDWARE( rmesa );
+-	 fprintf( stderr, "Error: Could not get dma buffer... exiting\n" );
+-	 exit( -1 );
++   } else {
++      for (i = 0; i + 1 < nr; i += 2) {
++	 OUT_BATCH((rmesa->tcl.aos[i].components << 0) |
++		   (rmesa->tcl.aos[i].stride << 8) |
++		   (rmesa->tcl.aos[i + 1].components << 16) |
++		   (rmesa->tcl.aos[i + 1].stride << 24));
++	 
++	 voffset =  rmesa->tcl.aos[i + 0].offset +
++	    offset * 4 * rmesa->tcl.aos[i + 0].stride;
++	 OUT_BATCH(voffset);
++	 voffset =  rmesa->tcl.aos[i + 1].offset +
++	    offset * 4 * rmesa->tcl.aos[i + 1].stride;
++	 OUT_BATCH(voffset);
+       }
+-   }
+-
+-   UNLOCK_HARDWARE(rmesa);
+-
+-   if (RADEON_DEBUG & DEBUG_DMA)
+-      fprintf(stderr, "Allocated buffer %d\n", index);
+-
+-   dmabuf = CALLOC_STRUCT( radeon_dma_buffer );
+-   dmabuf->buf = &rmesa->radeonScreen->buffers->list[index];
+-   dmabuf->refcount = 1;
+-
+-   rmesa->dma.current.buf = dmabuf;
+-   rmesa->dma.current.address = dmabuf->buf->address;
+-   rmesa->dma.current.end = dmabuf->buf->total;
+-   rmesa->dma.current.start = 0;
+-   rmesa->dma.current.ptr = 0;
+-
+-   rmesa->c_vertexBuffers++;
+-}
+-
+-void radeonReleaseDmaRegion( radeonContextPtr rmesa,
+-			     struct radeon_dma_region *region,
+-			     const char *caller )
+-{
+-   if (RADEON_DEBUG & DEBUG_IOCTL)
+-      fprintf(stderr, "%s from %s\n", __FUNCTION__, caller); 
+-   
+-   if (!region->buf)
+-      return;
+-
+-   if (rmesa->dma.flush)
+-      rmesa->dma.flush( rmesa );
+-
+-   if (--region->buf->refcount == 0) {
+-      drm_radeon_cmd_header_t *cmd;
+-
+-      if (RADEON_DEBUG & (DEBUG_IOCTL|DEBUG_DMA))
+-	 fprintf(stderr, "%s -- DISCARD BUF %d\n", __FUNCTION__,
+-		 region->buf->buf->idx);  
+       
+-      cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa, sizeof(*cmd), 
+-						     __FUNCTION__ );
+-      cmd->dma.cmd_type = RADEON_CMD_DMA_DISCARD;
+-      cmd->dma.buf_idx = region->buf->buf->idx;
+-      FREE(region->buf);
+-      rmesa->dma.nr_released_bufs++;
+-   }
+-
+-   region->buf = NULL;
+-   region->start = 0;
+-}
+-
+-/* Allocates a region from rmesa->dma.current.  If there isn't enough
+- * space in current, grab a new buffer (and discard what was left of current)
+- */
+-void radeonAllocDmaRegion( radeonContextPtr rmesa, 
+-			   struct radeon_dma_region *region,
+-			   int bytes,
+-			   int alignment )
+-{
+-   if (RADEON_DEBUG & DEBUG_IOCTL)
+-      fprintf(stderr, "%s %d\n", __FUNCTION__, bytes);
+-
+-   if (rmesa->dma.flush)
+-      rmesa->dma.flush( rmesa );
+-
+-   if (region->buf)
+-      radeonReleaseDmaRegion( rmesa, region, __FUNCTION__ );
+-
+-   alignment--;
+-   rmesa->dma.current.start = rmesa->dma.current.ptr = 
+-      (rmesa->dma.current.ptr + alignment) & ~alignment;
+-
+-   if ( rmesa->dma.current.ptr + bytes > rmesa->dma.current.end ) 
+-      radeonRefillCurrentDmaRegion( rmesa );
+-
+-   region->start = rmesa->dma.current.start;
+-   region->ptr = rmesa->dma.current.start;
+-   region->end = rmesa->dma.current.start + bytes;
+-   region->address = rmesa->dma.current.address;
+-   region->buf = rmesa->dma.current.buf;
+-   region->buf->refcount++;
+-
+-   rmesa->dma.current.ptr += bytes; /* bug - if alignment > 7 */
+-   rmesa->dma.current.start = 
+-      rmesa->dma.current.ptr = (rmesa->dma.current.ptr + 0x7) & ~0x7;  
+-}
+-
+-/* ================================================================
+- * SwapBuffers with client-side throttling
+- */
+-
+-static uint32_t radeonGetLastFrame (radeonContextPtr rmesa) 
+-{
+-   drm_radeon_getparam_t gp;
+-   int ret;
+-   uint32_t frame;
+-
+-   gp.param = RADEON_PARAM_LAST_FRAME;
+-   gp.value = (int *)&frame;
+-   ret = drmCommandWriteRead( rmesa->dri.fd, DRM_RADEON_GETPARAM,
+-			      &gp, sizeof(gp) );
+-
+-   if ( ret ) {
+-      fprintf( stderr, "%s: drm_radeon_getparam_t: %d\n", __FUNCTION__, ret );
+-      exit(1);
+-   }
+-
+-   return frame;
+-}
+-
+-static void radeonEmitIrqLocked( radeonContextPtr rmesa )
+-{
+-   drm_radeon_irq_emit_t ie;
+-   int ret;
+-
+-   ie.irq_seq = &rmesa->iw.irq_seq;
+-   ret = drmCommandWriteRead( rmesa->dri.fd, DRM_RADEON_IRQ_EMIT, 
+-			      &ie, sizeof(ie) );
+-   if ( ret ) {
+-      fprintf( stderr, "%s: drm_radeon_irq_emit_t: %d\n", __FUNCTION__, ret );
+-      exit(1);
+-   }
+-}
+-
+-
+-static void radeonWaitIrq( radeonContextPtr rmesa )
+-{
+-   int ret;
+-
+-   do {
+-      ret = drmCommandWrite( rmesa->dri.fd, DRM_RADEON_IRQ_WAIT,
+-			     &rmesa->iw, sizeof(rmesa->iw) );
+-   } while (ret && (errno == EINTR || errno == EBUSY));
+-
+-   if ( ret ) {
+-      fprintf( stderr, "%s: drmRadeonIrqWait: %d\n", __FUNCTION__, ret );
+-      exit(1);
+-   }
+-}
+-
+-
+-static void radeonWaitForFrameCompletion( radeonContextPtr rmesa )
+-{
+-   drm_radeon_sarea_t *sarea = rmesa->sarea;
+-
+-   if (rmesa->do_irqs) {
+-      if (radeonGetLastFrame(rmesa) < sarea->last_frame) {
+-	 if (!rmesa->irqsEmitted) {
+-	    while (radeonGetLastFrame (rmesa) < sarea->last_frame)
+-	       ;
+-	 }
+-	 else {
+-	    UNLOCK_HARDWARE( rmesa ); 
+-	    radeonWaitIrq( rmesa );	
+-	    LOCK_HARDWARE( rmesa ); 
+-	 }
+-	 rmesa->irqsEmitted = 10;
+-      }
+-
+-      if (rmesa->irqsEmitted) {
+-	 radeonEmitIrqLocked( rmesa );
+-	 rmesa->irqsEmitted--;
++      if (nr & 1) {
++	 OUT_BATCH((rmesa->tcl.aos[nr - 1].components << 0) |
++		   (rmesa->tcl.aos[nr - 1].stride << 8));
++	 voffset =  rmesa->tcl.aos[nr - 1].offset +
++	    offset * 4 * rmesa->tcl.aos[nr - 1].stride;
++	 OUT_BATCH(voffset);
+       }
+-   } 
+-   else {
+-      while (radeonGetLastFrame (rmesa) < sarea->last_frame) {
+-	 UNLOCK_HARDWARE( rmesa ); 
+-	 if (rmesa->do_usleeps) 
+-	    DO_USLEEP( 1 );
+-	 LOCK_HARDWARE( rmesa ); 
+-      }
+-   }
+-}
+-
+-/* Copy the back color buffer to the front color buffer.
+- */
+-void radeonCopyBuffer( __DRIdrawablePrivate *dPriv,
+-		       const drm_clip_rect_t	  *rect)
+-{
+-   radeonContextPtr rmesa;
+-   GLint nbox, i, ret;
+-   GLboolean   missed_target;
+-   int64_t ust;
+-   __DRIscreenPrivate *psp;
+-
+-   assert(dPriv);
+-   assert(dPriv->driContextPriv);
+-   assert(dPriv->driContextPriv->driverPrivate);
+-
+-   rmesa = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
+-
+-   if ( RADEON_DEBUG & DEBUG_IOCTL ) {
+-      fprintf( stderr, "\n%s( %p )\n\n", __FUNCTION__, (void *) rmesa->glCtx );
+-   }
+-
+-   RADEON_FIREVERTICES( rmesa );
+-   LOCK_HARDWARE( rmesa );
+-
+-   /* Throttle the frame rate -- only allow one pending swap buffers
+-    * request at a time.
+-    */
+-   radeonWaitForFrameCompletion( rmesa );
+-   if (!rect)
+-   {
+-       UNLOCK_HARDWARE( rmesa );
+-       driWaitForVBlank( dPriv, & missed_target );
+-       LOCK_HARDWARE( rmesa );
+-   }
+-
+-   nbox = dPriv->numClipRects; /* must be in locked region */
+-
+-   for ( i = 0 ; i < nbox ; ) {
+-      GLint nr = MIN2( i + RADEON_NR_SAREA_CLIPRECTS , nbox );
+-      drm_clip_rect_t *box = dPriv->pClipRects;
+-      drm_clip_rect_t *b = rmesa->sarea->boxes;
+-      GLint n = 0;
+-
+-      for ( ; i < nr ; i++ ) {
+-
+-	  *b = box[i];
+-
+-	  if (rect)
+-	  {
+-	      if (rect->x1 > b->x1)
+-		  b->x1 = rect->x1;
+-	      if (rect->y1 > b->y1)
+-		  b->y1 = rect->y1;
+-	      if (rect->x2 < b->x2)
+-		  b->x2 = rect->x2;
+-	      if (rect->y2 < b->y2)
+-		  b->y2 = rect->y2;
+-
+-	      if (b->x1 >= b->x2 || b->y1 >= b->y2)
+-		  continue;
+-	  }
+-
+-	  b++;
+-	  n++;
++      for (i = 0; i + 1 < nr; i += 2) {
++	 voffset =  rmesa->tcl.aos[i + 0].offset +
++	    offset * 4 * rmesa->tcl.aos[i + 0].stride;
++	 radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
++			       rmesa->tcl.aos[i+0].bo,
++			       RADEON_GEM_DOMAIN_GTT,
++			       0, 0);
++	 voffset =  rmesa->tcl.aos[i + 1].offset +
++	    offset * 4 * rmesa->tcl.aos[i + 1].stride;
++	 radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
++			       rmesa->tcl.aos[i+1].bo,
++			       RADEON_GEM_DOMAIN_GTT,
++			       0, 0);
+       }
+-      rmesa->sarea->nbox = n;
+-
+-      if (!n)
+-	 continue;
+-
+-      ret = drmCommandNone( rmesa->dri.fd, DRM_RADEON_SWAP );
+-
+-      if ( ret ) {
+-	 fprintf( stderr, "DRM_RADEON_SWAP_BUFFERS: return = %d\n", ret );
+-	 UNLOCK_HARDWARE( rmesa );
+-	 exit( 1 );
++      if (nr & 1) {
++	 voffset =  rmesa->tcl.aos[nr - 1].offset +
++	    offset * 4 * rmesa->tcl.aos[nr - 1].stride;
++	 radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
++			       rmesa->tcl.aos[nr-1].bo,
++			       RADEON_GEM_DOMAIN_GTT,
++			       0, 0);
+       }
+    }
++   END_BATCH();
+ 
+-   UNLOCK_HARDWARE( rmesa );
+-   if (!rect)
+-   {
+-       psp = dPriv->driScreenPriv;
+-       rmesa->swap_count++;
+-       (*psp->systemTime->getUST)( & ust );
+-       if ( missed_target ) {
+-	   rmesa->swap_missed_count++;
+-	   rmesa->swap_missed_ust = ust - rmesa->swap_ust;
+-       }
+-
+-       rmesa->swap_ust = ust;
+-       rmesa->hw.all_dirty = GL_TRUE;
+-   }
+-}
+-
+-void radeonPageFlip( __DRIdrawablePrivate *dPriv )
+-{
+-   radeonContextPtr rmesa;
+-   GLint ret;
+-   GLboolean   missed_target;
+-   __DRIscreenPrivate *psp;
+-
+-   assert(dPriv);
+-   assert(dPriv->driContextPriv);
+-   assert(dPriv->driContextPriv->driverPrivate);
+-
+-   rmesa = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
+-   psp = dPriv->driScreenPriv;
+-
+-   if ( RADEON_DEBUG & DEBUG_IOCTL ) {
+-      fprintf(stderr, "%s: pfCurrentPage: %d\n", __FUNCTION__,
+-	      rmesa->sarea->pfCurrentPage);
+-   }
+-
+-   RADEON_FIREVERTICES( rmesa );
+-   LOCK_HARDWARE( rmesa );
+-
+-   /* Need to do this for the perf box placement:
+-    */
+-   if (dPriv->numClipRects)
+-   {
+-      drm_clip_rect_t *box = dPriv->pClipRects;
+-      drm_clip_rect_t *b = rmesa->sarea->boxes;
+-      b[0] = box[0];
+-      rmesa->sarea->nbox = 1;
+-   }
+-
+-   /* Throttle the frame rate -- only allow a few pending swap buffers
+-    * request at a time.
+-    */
+-   radeonWaitForFrameCompletion( rmesa );
+-   UNLOCK_HARDWARE( rmesa );
+-   driWaitForVBlank( dPriv, & missed_target );
+-   if ( missed_target ) {
+-      rmesa->swap_missed_count++;
+-      (void) (*psp->systemTime->getUST)( & rmesa->swap_missed_ust );
+-   }
+-   LOCK_HARDWARE( rmesa );
+-
+-   ret = drmCommandNone( rmesa->dri.fd, DRM_RADEON_FLIP );
+-
+-   UNLOCK_HARDWARE( rmesa );
+-
+-   if ( ret ) {
+-      fprintf( stderr, "DRM_RADEON_FLIP: return = %d\n", ret );
+-      exit( 1 );
+-   }
+-
+-   rmesa->swap_count++;
+-   (void) (*psp->systemTime->getUST)( & rmesa->swap_ust );
+-
+-   /* Get ready for drawing next frame.  Update the renderbuffers'
+-    * flippedOffset/Pitch fields so we draw into the right place.
+-    */
+-   driFlipRenderbuffers(rmesa->glCtx->WinSysDrawBuffer,
+-                        rmesa->sarea->pfCurrentPage);
+-
+-   radeonUpdateDrawBuffer(rmesa->glCtx);
++#endif
+ }
+ 
+-
+ /* ================================================================
+  * Buffer clear
+  */
+@@ -1028,9 +407,9 @@ void radeonPageFlip( __DRIdrawablePrivate *dPriv )
+ 
+ static void radeonClear( GLcontext *ctx, GLbitfield mask )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+-   __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
+-   drm_radeon_sarea_t *sarea = rmesa->sarea;
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
++   __DRIdrawablePrivate *dPriv = rmesa->radeon.dri.drawable;
++   drm_radeon_sarea_t *sarea = rmesa->radeon.sarea;
+    uint32_t clear;
+    GLuint flags = 0;
+    GLuint color_mask = 0;
+@@ -1042,8 +421,8 @@ static void radeonClear( GLcontext *ctx, GLbitfield mask )
+    }
+ 
+    {
+-      LOCK_HARDWARE( rmesa );
+-      UNLOCK_HARDWARE( rmesa );
++      LOCK_HARDWARE( &rmesa->radeon );
++      UNLOCK_HARDWARE( &rmesa->radeon );
+       if ( dPriv->numClipRects == 0 ) 
+ 	 return;
+    }
+@@ -1067,7 +446,7 @@ static void radeonClear( GLcontext *ctx, GLbitfield mask )
+       mask &= ~BUFFER_BIT_DEPTH;
+    }
+ 
+-   if ( (mask & BUFFER_BIT_STENCIL) && rmesa->state.stencil.hwBuffer ) {
++   if ( (mask & BUFFER_BIT_STENCIL) && rmesa->radeon.state.stencil.hwBuffer ) {
+       flags |= RADEON_STENCIL;
+       mask &= ~BUFFER_BIT_STENCIL;
+    }
+@@ -1083,16 +462,16 @@ static void radeonClear( GLcontext *ctx, GLbitfield mask )
+ 
+    if (rmesa->using_hyperz) {
+       flags |= RADEON_USE_COMP_ZBUF;
+-/*      if (rmesa->radeonScreen->chipset & RADEON_CHIPSET_TCL) 
++/*      if (rmesa->radeon.radeonScreen->chipset & RADEON_CHIPSET_TCL) 
+          flags |= RADEON_USE_HIERZ; */
+-      if (!(rmesa->state.stencil.hwBuffer) ||
++      if (!(rmesa->radeon.state.stencil.hwBuffer) ||
+ 	 ((flags & RADEON_DEPTH) && (flags & RADEON_STENCIL) &&
+-	    ((rmesa->state.stencil.clear & RADEON_STENCIL_WRITE_MASK) == RADEON_STENCIL_WRITE_MASK))) {
++	    ((rmesa->radeon.state.stencil.clear & RADEON_STENCIL_WRITE_MASK) == RADEON_STENCIL_WRITE_MASK))) {
+ 	  flags |= RADEON_CLEAR_FASTZ;
+       }
+    }
+ 
+-   LOCK_HARDWARE( rmesa );
++   LOCK_HARDWARE( &rmesa->radeon );
+ 
+    /* compute region after locking: */
+    cx = ctx->DrawBuffer->_Xmin;
+@@ -1112,7 +491,7 @@ static void radeonClear( GLcontext *ctx, GLbitfield mask )
+ 
+       gp.param = RADEON_PARAM_LAST_CLEAR;
+       gp.value = (int *)&clear;
+-      ret = drmCommandWriteRead( rmesa->dri.fd,
++      ret = drmCommandWriteRead( rmesa->radeon.dri.fd,
+ 				 DRM_RADEON_GETPARAM, &gp, sizeof(gp) );
+ 
+       if ( ret ) {
+@@ -1124,20 +503,20 @@ static void radeonClear( GLcontext *ctx, GLbitfield mask )
+ 	 break;
+       }
+ 
+-      if ( rmesa->do_usleeps ) {
+-	 UNLOCK_HARDWARE( rmesa );
++      if ( rmesa->radeon.do_usleeps ) {
++	 UNLOCK_HARDWARE( &rmesa->radeon );
+ 	 DO_USLEEP( 1 );
+-	 LOCK_HARDWARE( rmesa );
++	 LOCK_HARDWARE( &rmesa->radeon );
+       }
+    }
+ 
+    /* Send current state to the hardware */
+-   radeonFlushCmdBufLocked( rmesa, __FUNCTION__ );
++   rcommonFlushCmdBufLocked( &rmesa->radeon, __FUNCTION__ );
+ 
+    for ( i = 0 ; i < dPriv->numClipRects ; ) {
+       GLint nr = MIN2( i + RADEON_NR_SAREA_CLIPRECTS, dPriv->numClipRects );
+       drm_clip_rect_t *box = dPriv->pClipRects;
+-      drm_clip_rect_t *b = rmesa->sarea->boxes;
++      drm_clip_rect_t *b = rmesa->radeon.sarea->boxes;
+       drm_radeon_clear_t clear;
+       drm_radeon_clear_rect_t depth_boxes[RADEON_NR_SAREA_CLIPRECTS];
+       GLint n = 0;
+@@ -1172,106 +551,40 @@ static void radeonClear( GLcontext *ctx, GLbitfield mask )
+ 	 }
+       }
+ 
+-      rmesa->sarea->nbox = n;
++      rmesa->radeon.sarea->nbox = n;
+ 
+       clear.flags       = flags;
+-      clear.clear_color = rmesa->state.color.clear;
+-      clear.clear_depth = rmesa->state.depth.clear;
++      clear.clear_color = rmesa->radeon.state.color.clear;
++      clear.clear_depth = rmesa->radeon.state.depth.clear;
+       clear.color_mask  = rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK];
+-      clear.depth_mask  = rmesa->state.stencil.clear;
++      clear.depth_mask  = rmesa->radeon.state.stencil.clear;
+       clear.depth_boxes = depth_boxes;
+ 
+       n--;
+-      b = rmesa->sarea->boxes;
++      b = rmesa->radeon.sarea->boxes;
+       for ( ; n >= 0 ; n-- ) {
+ 	 depth_boxes[n].f[CLEAR_X1] = (float)b[n].x1;
+ 	 depth_boxes[n].f[CLEAR_Y1] = (float)b[n].y1;
+ 	 depth_boxes[n].f[CLEAR_X2] = (float)b[n].x2;
+ 	 depth_boxes[n].f[CLEAR_Y2] = (float)b[n].y2;
+ 	 depth_boxes[n].f[CLEAR_DEPTH] = 
+-	    (float)rmesa->state.depth.clear;
++	    (float)rmesa->radeon.state.depth.clear;
+       }
+ 
+-      ret = drmCommandWrite( rmesa->dri.fd, DRM_RADEON_CLEAR,
++      ret = drmCommandWrite( rmesa->radeon.dri.fd, DRM_RADEON_CLEAR,
+ 			     &clear, sizeof(drm_radeon_clear_t));
+ 
+       if ( ret ) {
+-	 UNLOCK_HARDWARE( rmesa );
++	 UNLOCK_HARDWARE( &rmesa->radeon );
+ 	 fprintf( stderr, "DRM_RADEON_CLEAR: return = %d\n", ret );
+ 	 exit( 1 );
+       }
+    }
+ 
+-   UNLOCK_HARDWARE( rmesa );
+-   rmesa->hw.all_dirty = GL_TRUE;
++   UNLOCK_HARDWARE( &rmesa->radeon );
++   rmesa->radeon.hw.all_dirty = GL_TRUE;
+ }
+ 
+-
+-void radeonWaitForIdleLocked( radeonContextPtr rmesa )
+-{
+-    int fd = rmesa->dri.fd;
+-    int to = 0;
+-    int ret, i = 0;
+-
+-    rmesa->c_drawWaits++;
+-
+-    do {
+-        do {
+-            ret = drmCommandNone( fd, DRM_RADEON_CP_IDLE);
+-        } while ( ret && errno == EBUSY && i++ < RADEON_IDLE_RETRY );
+-    } while ( ( ret == -EBUSY ) && ( to++ < RADEON_TIMEOUT ) );
+-
+-    if ( ret < 0 ) {
+-	UNLOCK_HARDWARE( rmesa );
+-	fprintf( stderr, "Error: Radeon timed out... exiting\n" );
+-	exit( -1 );
+-    }
+-}
+-
+-
+-static void radeonWaitForIdle( radeonContextPtr rmesa )
+-{
+-   LOCK_HARDWARE(rmesa);
+-   radeonWaitForIdleLocked( rmesa );
+-   UNLOCK_HARDWARE(rmesa);
+-}
+-
+-
+-void radeonFlush( GLcontext *ctx )
+-{
+-   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
+-
+-   if (RADEON_DEBUG & DEBUG_IOCTL)
+-      fprintf(stderr, "%s\n", __FUNCTION__);
+-
+-   if (rmesa->dma.flush)
+-      rmesa->dma.flush( rmesa );
+-
+-   radeonEmitState( rmesa );
+-   
+-   if (rmesa->store.cmd_used)
+-      radeonFlushCmdBuf( rmesa, __FUNCTION__ );
+-}
+-
+-/* Make sure all commands have been sent to the hardware and have
+- * completed processing.
+- */
+-void radeonFinish( GLcontext *ctx )
+-{
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+-   radeonFlush( ctx );
+-
+-   if (rmesa->do_irqs) {
+-      LOCK_HARDWARE( rmesa );
+-      radeonEmitIrqLocked( rmesa );
+-      UNLOCK_HARDWARE( rmesa );
+-      radeonWaitIrq( rmesa );
+-   }
+-   else
+-      radeonWaitForIdle( rmesa );
+-}
+-
+-
+ void radeonInitIoctlFuncs( GLcontext *ctx )
+ {
+     ctx->Driver.Clear = radeonClear;
+diff --git a/src/mesa/drivers/dri/radeon/radeon_ioctl.h b/src/mesa/drivers/dri/radeon/radeon_ioctl.h
+index 4e3a44d..18805d4 100644
+--- a/src/mesa/drivers/dri/radeon/radeon_ioctl.h
++++ b/src/mesa/drivers/dri/radeon/radeon_ioctl.h
+@@ -38,31 +38,32 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ #include "main/simple_list.h"
+ #include "radeon_lock.h"
++#include "radeon_bocs_wrapper.h"
+ 
+-
+-extern void radeonEmitState( radeonContextPtr rmesa );
+-extern void radeonEmitVertexAOS( radeonContextPtr rmesa,
++extern void radeonEmitVertexAOS( r100ContextPtr rmesa,
+ 				 GLuint vertex_size,
++				 struct radeon_bo *bo,
+ 				 GLuint offset );
+ 
+-extern void radeonEmitVbufPrim( radeonContextPtr rmesa,
++extern void radeonEmitVbufPrim( r100ContextPtr rmesa,
+ 				GLuint vertex_format,
+ 				GLuint primitive,
+ 				GLuint vertex_nr );
+ 
+-extern void radeonFlushElts( radeonContextPtr rmesa );
++extern void radeonFlushElts( GLcontext *ctx );
++			    
+ 
+-extern GLushort *radeonAllocEltsOpenEnded( radeonContextPtr rmesa,
++extern GLushort *radeonAllocEltsOpenEnded( r100ContextPtr rmesa,
+ 					   GLuint vertex_format,
+ 					   GLuint primitive,
+ 					   GLuint min_nr );
+ 
+-extern void radeonEmitAOS( radeonContextPtr rmesa,
+-			   struct radeon_dma_region **regions,
++
++extern void radeonEmitAOS( r100ContextPtr rmesa,
+ 			   GLuint n,
+ 			   GLuint offset );
+ 
+-extern void radeonEmitBlit( radeonContextPtr rmesa,
++extern void radeonEmitBlit( r100ContextPtr rmesa,
+ 			    GLuint color_fmt,
+ 			    GLuint src_pitch,
+ 			    GLuint src_offset,
+@@ -72,30 +73,15 @@ extern void radeonEmitBlit( radeonContextPtr rmesa,
+ 			    GLint dstx, GLint dsty,
+ 			    GLuint w, GLuint h );
+ 
+-extern void radeonEmitWait( radeonContextPtr rmesa, GLuint flags );
+-
+-extern void radeonFlushCmdBuf( radeonContextPtr rmesa, const char * );
+-extern void radeonRefillCurrentDmaRegion( radeonContextPtr rmesa );
++extern void radeonEmitWait( r100ContextPtr rmesa, GLuint flags );
+ 
+-extern void radeonAllocDmaRegion( radeonContextPtr rmesa,
+-				  struct radeon_dma_region *region,
+-				  int bytes, 
+-				  int alignment );
++extern void radeonFlushCmdBuf( r100ContextPtr rmesa, const char * );
+ 
+-extern void radeonReleaseDmaRegion( radeonContextPtr rmesa,
+-				    struct radeon_dma_region *region,
+-				    const char *caller );
+-
+-extern void radeonCopyBuffer( __DRIdrawablePrivate *drawable,
+-			      const drm_clip_rect_t	 *rect);
+-extern void radeonPageFlip( __DRIdrawablePrivate *drawable );
+ extern void radeonFlush( GLcontext *ctx );
+ extern void radeonFinish( GLcontext *ctx );
+-extern void radeonWaitForIdleLocked( radeonContextPtr rmesa );
+-extern void radeonWaitForVBlank( radeonContextPtr rmesa );
+ extern void radeonInitIoctlFuncs( GLcontext *ctx );
+-extern void radeonGetAllParams( radeonContextPtr rmesa );
+-extern void radeonSetUpAtomList( radeonContextPtr rmesa );
++extern void radeonGetAllParams( r100ContextPtr rmesa );
++extern void radeonSetUpAtomList( r100ContextPtr rmesa );
+ 
+ /* ================================================================
+  * Helper macros:
+@@ -105,33 +91,33 @@ extern void radeonSetUpAtomList( radeonContextPtr rmesa );
+  */
+ #define RADEON_NEWPRIM( rmesa )			\
+ do {						\
+-   if ( rmesa->dma.flush )			\
+-      rmesa->dma.flush( rmesa );	\
++   if ( rmesa->radeon.dma.flush )			\
++      rmesa->radeon.dma.flush( rmesa->radeon.glCtx );	\
+ } while (0)
+ 
+ /* Can accomodate several state changes and primitive changes without
+  * actually firing the buffer.
+  */
++
+ #define RADEON_STATECHANGE( rmesa, ATOM )			\
+ do {								\
+    RADEON_NEWPRIM( rmesa );					\
+    rmesa->hw.ATOM.dirty = GL_TRUE;				\
+-   rmesa->hw.is_dirty = GL_TRUE;				\
++   rmesa->radeon.hw.is_dirty = GL_TRUE;				\
+ } while (0)
+ 
+-#define RADEON_DB_STATE( ATOM )			        \
++#define RADEON_DB_STATE( ATOM )				\
+    memcpy( rmesa->hw.ATOM.lastcmd, rmesa->hw.ATOM.cmd,	\
+ 	   rmesa->hw.ATOM.cmd_size * 4)
+ 
+-static INLINE int RADEON_DB_STATECHANGE( 
+-   radeonContextPtr rmesa,
+-   struct radeon_state_atom *atom )
++static INLINE int RADEON_DB_STATECHANGE(r100ContextPtr rmesa,
++					struct radeon_state_atom *atom )
+ {
+    if (memcmp(atom->cmd, atom->lastcmd, atom->cmd_size*4)) {
+-      int *tmp;
++      GLuint *tmp;
+       RADEON_NEWPRIM( rmesa );
+       atom->dirty = GL_TRUE;
+-      rmesa->hw.is_dirty = GL_TRUE;
++      rmesa->radeon.hw.is_dirty = GL_TRUE;
+       tmp = atom->cmd; 
+       atom->cmd = atom->lastcmd;
+       atom->lastcmd = tmp;
+@@ -141,16 +127,6 @@ static INLINE int RADEON_DB_STATECHANGE(
+       return 0;
+ }
+ 
+-
+-/* Fire the buffered vertices no matter what.
+- */
+-#define RADEON_FIREVERTICES( rmesa )			\
+-do {							\
+-   if ( rmesa->store.cmd_used || rmesa->dma.flush ) {	\
+-      radeonFlush( rmesa->glCtx );			\
+-   }							\
+-} while (0)
+-
+ /* Command lengths.  Note that any time you ensure ELTS_BUFSZ or VBUF_BUFSZ
+  * are available, you will also be adding an rmesa->state.max_state_size because
+  * r200EmitState is called from within r200EmitVbufPrim and r200FlushElts.
+@@ -167,36 +143,37 @@ do {							\
+ #define VBUF_BUFSZ	(4 * sizeof(int))
+ #endif
+ 
+-/* Ensure that a minimum amount of space is available in the command buffer.
+- * This is used to ensure atomicity of state updates with the rendering requests
+- * that rely on them.
+- *
+- * An alternative would be to implement a "soft lock" such that when the buffer
+- * wraps at an inopportune time, we grab the lock, flush the current buffer,
+- * and hang on to the lock until the critical section is finished and we flush
+- * the buffer again and unlock.
+- */
+-static INLINE void radeonEnsureCmdBufSpace( radeonContextPtr rmesa,
+-					      int bytes )
+-{
+-   if (rmesa->store.cmd_used + bytes > RADEON_CMD_BUF_SZ)
+-      radeonFlushCmdBuf( rmesa, __FUNCTION__ );
+-   assert( bytes <= RADEON_CMD_BUF_SZ );
+-}
+ 
+-/* Alloc space in the command buffer
+- */
+-static INLINE char *radeonAllocCmdBuf( radeonContextPtr rmesa,
+-					 int bytes, const char *where )
++static inline uint32_t cmdpacket3(int cmd_type)
+ {
+-   if (rmesa->store.cmd_used + bytes > RADEON_CMD_BUF_SZ)
+-      radeonFlushCmdBuf( rmesa, __FUNCTION__ );
++  drm_radeon_cmd_header_t cmd;
++
++  cmd.i = 0;
++  cmd.header.cmd_type = cmd_type;
++
++  return (uint32_t)cmd.i;
+ 
+-   {
+-      char *head = rmesa->store.cmd_buf + rmesa->store.cmd_used;
+-      rmesa->store.cmd_used += bytes;
+-      return head;
+-   }
+ }
+ 
++#define OUT_BATCH_PACKET3(packet, num_extra) do {	      \
++    if (!b_l_rmesa->radeonScreen->kernel_mm) {		      \
++      OUT_BATCH(cmdpacket3(RADEON_CMD_PACKET3));				      \
++      OUT_BATCH(CP_PACKET3((packet), (num_extra)));	      \
++    } else {						      \
++      OUT_BATCH(CP_PACKET2);				      \
++      OUT_BATCH(CP_PACKET3((packet), (num_extra)));	      \
++    }							      \
++  } while(0)
++
++#define OUT_BATCH_PACKET3_CLIP(packet, num_extra) do {	      \
++    if (!b_l_rmesa->radeonScreen->kernel_mm) {		      \
++      OUT_BATCH(cmdpacket3(RADEON_CMD_PACKET3_CLIP));	      \
++      OUT_BATCH(CP_PACKET3((packet), (num_extra)));	      \
++    } else {						      \
++      OUT_BATCH(CP_PACKET2);				      \
++      OUT_BATCH(CP_PACKET3((packet), (num_extra)));	      \
++    }							      \
++  } while(0)
++
++
+ #endif /* __RADEON_IOCTL_H__ */
+diff --git a/src/mesa/drivers/dri/radeon/radeon_lock.c b/src/mesa/drivers/dri/radeon/radeon_lock.c
+index 64bb3ca..9a7e76b 100644
+--- a/src/mesa/drivers/dri/radeon/radeon_lock.c
++++ b/src/mesa/drivers/dri/radeon/radeon_lock.c
+@@ -41,12 +41,11 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ #include "main/glheader.h"
+ #include "main/mtypes.h"
+-#include "radeon_context.h"
++#include "main/colormac.h"
++#include "dri_util.h"
++#include "radeon_screen.h"
++#include "radeon_common.h"
+ #include "radeon_lock.h"
+-#include "radeon_tex.h"
+-#include "radeon_state.h"
+-#include "radeon_ioctl.h"
+-
+ #include "drirenderbuffer.h"
+ 
+ #if DEBUG_LOCKING
+@@ -56,13 +55,28 @@ int prevLockLine = 0;
+ 
+ /* Turn on/off page flipping according to the flags in the sarea:
+  */
+-static void radeonUpdatePageFlipping(radeonContextPtr rmesa)
++void radeonUpdatePageFlipping(radeonContextPtr rmesa)
+ {
++	int use_back;
++	__DRIdrawablePrivate *const drawable = rmesa->dri.drawable;
++	GLframebuffer *fb = drawable->driverPrivate;
++
+ 	rmesa->doPageFlip = rmesa->sarea->pfState;
+ 	if (rmesa->glCtx->WinSysDrawBuffer) {
+-		driFlipRenderbuffers(rmesa->glCtx->WinSysDrawBuffer,
+-				     rmesa->sarea->pfCurrentPage);
++		rmesa->vtbl.update_draw_buffer(rmesa->glCtx);
+ 	}
 +
-+	if (RADEON_DEBUG & DEBUG_VERTS)
-+		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
-+			__FUNCTION__, count, stride, (void *)out, (void *)data);
++	use_back = rmesa->glCtx->DrawBuffer ?
++	    (rmesa->glCtx->DrawBuffer->_ColorDrawBufferIndexes[0] ==
++	     BUFFER_BACK_LEFT) : 1;
++	use_back ^= (rmesa->sarea->pfCurrentPage == 1);
 +
-+	if (stride == 8)
-+		COPY_DWORDS(out, data, count * 2);
++	if (use_back)
++		rmesa->state.color.rrb = (void *)fb->Attachment[BUFFER_BACK_LEFT].Renderbuffer;
 +	else
-+		for (i = 0; i < count; i++) {
-+			out[0] = *(int *)data;
-+			out[1] = *(int *)(data + 4);
-+			out += 2;
-+			data += stride;
-+		}
-+}
++		rmesa->state.color.rrb = (void *)fb->Attachment[BUFFER_FRONT_LEFT].Renderbuffer;
++
++	rmesa->state.depth.rrb = (void *)fb->Attachment[BUFFER_DEPTH].Renderbuffer;
+ }
+ 
+ /* Update the hardware state.  This is called if another context has
+@@ -80,6 +94,8 @@ void radeonGetLock(radeonContextPtr rmesa, GLuint flags)
+ 	__DRIscreenPrivate *sPriv = rmesa->dri.screen;
+ 	drm_radeon_sarea_t *sarea = rmesa->sarea;
+ 
++	assert(drawable != NULL);
++
+ 	drmGetLock(rmesa->dri.fd, rmesa->dri.hwContext, flags);
+ 
+ 	/* The window might have moved, so we might need to get new clip
+@@ -98,27 +114,11 @@ void radeonGetLock(radeonContextPtr rmesa, GLuint flags)
+ 	if (rmesa->lastStamp != drawable->lastStamp) {
+ 		radeonUpdatePageFlipping(rmesa);
+ 		radeonSetCliprects(rmesa);
+-		radeonUpdateViewportOffset(rmesa->glCtx);
++		rmesa->vtbl.update_viewport_offset(rmesa->glCtx);
+ 		driUpdateFramebufferSize(rmesa->glCtx, drawable);
+ 	}
+ 
+-	RADEON_STATECHANGE(rmesa, ctx);
+-	if (rmesa->sarea->tiling_enabled) {
+-		rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |=
+-		    RADEON_COLOR_TILE_ENABLE;
+-	} else {
+-		rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] &=
+-		    ~RADEON_COLOR_TILE_ENABLE;
+-	}
+-
+-	if (sarea->ctx_owner != rmesa->dri.hwContext) {
+-		int i;
+-		sarea->ctx_owner = rmesa->dri.hwContext;
+-
+-		for (i = 0; i < rmesa->nr_heaps; i++) {
+-			DRI_AGE_TEXTURES(rmesa->texture_heaps[i]);
+-		}
+-	}
++	rmesa->vtbl.get_lock(rmesa);
+ 
+ 	rmesa->lost_context = GL_TRUE;
+ }
+diff --git a/src/mesa/drivers/dri/radeon/radeon_lock.h b/src/mesa/drivers/dri/radeon/radeon_lock.h
+index 86e96aa..f5ebb8d 100644
+--- a/src/mesa/drivers/dri/radeon/radeon_lock.h
++++ b/src/mesa/drivers/dri/radeon/radeon_lock.h
+@@ -39,8 +39,12 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+  *   Kevin E. Martin <martin@valinux.com>
+  */
+ 
+-#ifndef __RADEON_LOCK_H__
+-#define __RADEON_LOCK_H__
++#ifndef COMMON_LOCK_H
++#define COMMON_LOCK_H
 +
-+void radeonEmitVec12(uint32_t *out, GLvoid * data, int stride, int count)
-+{
-+	int i;
++#include "main/colormac.h"
++#include "radeon_screen.h"
++#include "radeon_common.h"
+ 
+ extern void radeonGetLock(radeonContextPtr rmesa, GLuint flags);
+ 
+@@ -94,19 +98,23 @@ extern int prevLockLine;
+    do {								\
+       char __ret = 0;						\
+       DEBUG_CHECK_LOCK();					\
+-      DRM_CAS( (rmesa)->dri.hwLock, (rmesa)->dri.hwContext,		\
+-	       (DRM_LOCK_HELD | (rmesa)->dri.hwContext), __ret );	\
+-      if ( __ret )						\
+-	 radeonGetLock( (rmesa), 0 );				\
+-      DEBUG_LOCK();						\
++      if (!(rmesa)->radeonScreen->driScreen->dri2.enabled) {		\
++	DRM_CAS( (rmesa)->dri.hwLock, (rmesa)->dri.hwContext,		\
++		 (DRM_LOCK_HELD | (rmesa)->dri.hwContext), __ret );	\
++	if ( __ret )							\
++	  radeonGetLock( (rmesa), 0 );					\
++      }									\
++      DEBUG_LOCK();							\
+    } while (0)
+ 
+ #define UNLOCK_HARDWARE( rmesa )					\
+    do {									\
+-      DRM_UNLOCK( (rmesa)->dri.fd,					\
+-		  (rmesa)->dri.hwLock,					\
+-		  (rmesa)->dri.hwContext );				\
+-      DEBUG_RESET();							\
++     if (!(rmesa)->radeonScreen->driScreen->dri2.enabled) {		\
++       DRM_UNLOCK( (rmesa)->dri.fd,					\
++		   (rmesa)->dri.hwLock,					\
++		   (rmesa)->dri.hwContext );				\
++       DEBUG_RESET();							\
++     }									\
+    } while (0)
+ 
+-#endif				/* __RADEON_LOCK_H__ */
++#endif
+diff --git a/src/mesa/drivers/dri/radeon/radeon_maos_arrays.c b/src/mesa/drivers/dri/radeon/radeon_maos_arrays.c
+index de3c3a1..7f5da16 100644
+--- a/src/mesa/drivers/dri/radeon/radeon_maos_arrays.c
++++ b/src/mesa/drivers/dri/radeon/radeon_maos_arrays.c
+@@ -40,7 +40,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "swrast_setup/swrast_setup.h"
+ #include "math/m_translate.h"
+ #include "tnl/tnl.h"
+-#include "tnl/tcontext.h"
+ 
+ #include "radeon_context.h"
+ #include "radeon_ioctl.h"
+@@ -49,160 +48,35 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "radeon_maos.h"
+ #include "radeon_tcl.h"
+ 
+-#if 0
+-/* Usage:
+- *   - from radeon_tcl_render
+- *   - call radeonEmitArrays to ensure uptodate arrays in dma
+- *   - emit primitives (new type?) which reference the data
+- *       -- need to use elts for lineloop, quads, quadstrip/flat
+- *       -- other primitives are all well-formed (need tristrip-1,fake-poly)
+- *
+- */
+-static void emit_ubyte_rgba3( GLcontext *ctx,
+-		       struct radeon_dma_region *rvb,
+-		       char *data,
+-		       int stride,
+-		       int count )
++static void emit_vecfog(GLcontext *ctx, struct radeon_aos *aos,
++			GLvoid *data, int stride, int count)
+ {
+    int i;
+-   radeon_color_t *out = (radeon_color_t *)(rvb->start + rvb->address);
+-
+-   if (RADEON_DEBUG & DEBUG_VERTS)
+-      fprintf(stderr, "%s count %d stride %d out %p\n",
+-	      __FUNCTION__, count, stride, (void *)out);
+-
+-   for (i = 0; i < count; i++) {
+-      out->red   = *data;
+-      out->green = *(data+1);
+-      out->blue  = *(data+2);
+-      out->alpha = 0xFF;
+-      out++;
+-      data += stride;
+-   }
+-}
+-
+-static void emit_ubyte_rgba4( GLcontext *ctx,
+-			      struct radeon_dma_region *rvb,
+-			      char *data,
+-			      int stride,
+-			      int count )
+-{
+-   int i;
+-   int *out = (int *)(rvb->address + rvb->start);
++   uint32_t *out;
++   int size = 1;
++   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+ 
+    if (RADEON_DEBUG & DEBUG_VERTS)
+       fprintf(stderr, "%s count %d stride %d\n",
+ 	      __FUNCTION__, count, stride);
+ 
+-   if (stride == 4)
+-       COPY_DWORDS( out, data, count );
+-   else
+-      for (i = 0; i < count; i++) {
+-	 *out++ = LE32_TO_CPU(*(int *)data);
+-	 data += stride;
+-      }
+-}
+-
+-
+-static void emit_ubyte_rgba( GLcontext *ctx,
+-			     struct radeon_dma_region *rvb,
+-			     char *data,
+-			     int size,
+-			     int stride,
+-			     int count )
+-{
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+-
+-   if (RADEON_DEBUG & DEBUG_VERTS)
+-      fprintf(stderr, "%s %d/%d\n", __FUNCTION__, count, size);
+-
+-   assert (!rvb->buf);
+-
+    if (stride == 0) {
+-      radeonAllocDmaRegion( rmesa, rvb, 4, 4 );
++      radeonAllocDmaRegion( rmesa, &aos->bo, &aos->offset, size * 4, 32 );
+       count = 1;
+-      rvb->aos_start = GET_START(rvb);
+-      rvb->aos_stride = 0;
+-      rvb->aos_size = 1;
++      aos->stride = 0;
+    }
+    else {
+-      radeonAllocDmaRegion( rmesa, rvb, 4 * count, 4 );	/* alignment? */
+-      rvb->aos_start = GET_START(rvb);
+-      rvb->aos_stride = 1;
+-      rvb->aos_size = 1;
++      radeonAllocDmaRegion(rmesa, &aos->bo, &aos->offset, size * 4, 32);
++      aos->stride = size;
+    }
+ 
+-   /* Emit the data
+-    */
+-   switch (size) {
+-   case 3:
+-      emit_ubyte_rgba3( ctx, rvb, data, stride, count );
+-      break;
+-   case 4:
+-      emit_ubyte_rgba4( ctx, rvb, data, stride, count );
+-      break;
+-   default:
+-      assert(0);
+-      exit(1);
+-      break;
+-   }
+-}
+-#endif
+-
+-#if defined(USE_X86_ASM)
+-#define COPY_DWORDS( dst, src, nr )					\
+-do {									\
+-	int __tmp;							\
+-	__asm__ __volatile__( "rep ; movsl"				\
+-			      : "=%c" (__tmp), "=D" (dst), "=S" (__tmp)	\
+-			      : "0" (nr),				\
+-			        "D" ((long)dst),			\
+-			        "S" ((long)src) );			\
+-} while (0)
+-#else
+-#define COPY_DWORDS( dst, src, nr )		\
+-do {						\
+-   int j;					\
+-   for ( j = 0 ; j < nr ; j++ )			\
+-      dst[j] = ((int *)src)[j];			\
+-   dst += nr;					\
+-} while (0)
+-#endif
+-
+-static void emit_vecfog( GLcontext *ctx,
+-			 struct radeon_dma_region *rvb,
+-			 char *data,
+-			 int stride,
+-			 int count )
+-{
+-   int i;
+-   GLfloat *out;
+-
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   aos->components = size;
++   aos->count = count;
+ 
+-   if (RADEON_DEBUG & DEBUG_VERTS)
+-      fprintf(stderr, "%s count %d stride %d\n",
+-	      __FUNCTION__, count, stride);
+-
+-   assert (!rvb->buf);
+-
+-   if (stride == 0) {
+-      radeonAllocDmaRegion( rmesa, rvb, 4, 4 );
+-      count = 1;
+-      rvb->aos_start = GET_START(rvb);
+-      rvb->aos_stride = 0;
+-      rvb->aos_size = 1;
+-   }
+-   else {
+-      radeonAllocDmaRegion( rmesa, rvb, count * 4, 4 );	/* alignment? */
+-      rvb->aos_start = GET_START(rvb);
+-      rvb->aos_stride = 1;
+-      rvb->aos_size = 1;
+-   }
+ 
+    /* Emit the data
+     */
+-   out = (GLfloat *)(rvb->address + rvb->start);
++   out = (uint32_t*)((char*)aos->bo->ptr + aos->offset);
+    for (i = 0; i < count; i++) {
+       out[0] = radeonComputeFogBlendFactor( ctx, *(GLfloat *)data );
+       out++;
+@@ -210,169 +84,9 @@ static void emit_vecfog( GLcontext *ctx,
+    }
+ }
+ 
+-static void emit_vec4( GLcontext *ctx,
+-		       struct radeon_dma_region *rvb,
+-		       char *data,
+-		       int stride,
+-		       int count )
+-{
+-   int i;
+-   int *out = (int *)(rvb->address + rvb->start);
+-
+-   if (RADEON_DEBUG & DEBUG_VERTS)
+-      fprintf(stderr, "%s count %d stride %d\n",
+-	      __FUNCTION__, count, stride);
+-
+-   if (stride == 4)
+-      COPY_DWORDS( out, data, count );
+-   else
+-      for (i = 0; i < count; i++) {
+-	 out[0] = *(int *)data;
+-	 out++;
+-	 data += stride;
+-      }
+-}
+-
+-
+-static void emit_vec8( GLcontext *ctx,
+-		       struct radeon_dma_region *rvb,
+-		       char *data,
+-		       int stride,
+-		       int count )
+-{
+-   int i;
+-   int *out = (int *)(rvb->address + rvb->start);
+-
+-   if (RADEON_DEBUG & DEBUG_VERTS)
+-      fprintf(stderr, "%s count %d stride %d\n",
+-	      __FUNCTION__, count, stride);
+-
+-   if (stride == 8)
+-      COPY_DWORDS( out, data, count*2 );
+-   else
+-      for (i = 0; i < count; i++) {
+-	 out[0] = *(int *)data;
+-	 out[1] = *(int *)(data+4);
+-	 out += 2;
+-	 data += stride;
+-      }
+-}
+-
+-static void emit_vec12( GLcontext *ctx,
+-		       struct radeon_dma_region *rvb,
+-		       char *data,
+-		       int stride,
+-		       int count )
+-{
+-   int i;
+-   int *out = (int *)(rvb->address + rvb->start);
+-
+-   if (RADEON_DEBUG & DEBUG_VERTS)
+-      fprintf(stderr, "%s count %d stride %d out %p data %p\n",
+-	      __FUNCTION__, count, stride, (void *)out, (void *)data);
+-
+-   if (stride == 12)
+-      COPY_DWORDS( out, data, count*3 );
+-   else
+-      for (i = 0; i < count; i++) {
+-	 out[0] = *(int *)data;
+-	 out[1] = *(int *)(data+4);
+-	 out[2] = *(int *)(data+8);
+-	 out += 3;
+-	 data += stride;
+-      }
+-}
+-
+-static void emit_vec16( GLcontext *ctx,
+-			struct radeon_dma_region *rvb,
+-			char *data,
+-			int stride,
+-			int count )
+-{
+-   int i;
+-   int *out = (int *)(rvb->address + rvb->start);
+-
+-   if (RADEON_DEBUG & DEBUG_VERTS)
+-      fprintf(stderr, "%s count %d stride %d\n",
+-	      __FUNCTION__, count, stride);
+-
+-   if (stride == 16)
+-      COPY_DWORDS( out, data, count*4 );
+-   else
+-      for (i = 0; i < count; i++) {
+-	 out[0] = *(int *)data;
+-	 out[1] = *(int *)(data+4);
+-	 out[2] = *(int *)(data+8);
+-	 out[3] = *(int *)(data+12);
+-	 out += 4;
+-	 data += stride;
+-      }
+-}
+-
+-
+-static void emit_vector( GLcontext *ctx,
+-			 struct radeon_dma_region *rvb,
+-			 char *data,
+-			 int size,
+-			 int stride,
+-			 int count )
+-{
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+-
+-   if (RADEON_DEBUG & DEBUG_VERTS)
+-      fprintf(stderr, "%s count %d size %d stride %d\n",
+-	      __FUNCTION__, count, size, stride);
+-
+-   assert (!rvb->buf);
+-
+-   if (stride == 0) {
+-      radeonAllocDmaRegion( rmesa, rvb, size * 4, 4 );
+-      count = 1;
+-      rvb->aos_start = GET_START(rvb);
+-      rvb->aos_stride = 0;
+-      rvb->aos_size = size;
+-   }
+-   else {
+-      radeonAllocDmaRegion( rmesa, rvb, size * count * 4, 4 );	/* alignment? */
+-      rvb->aos_start = GET_START(rvb);
+-      rvb->aos_stride = size;
+-      rvb->aos_size = size;
+-   }
+-
+-   /* Emit the data
+-    */
+-   switch (size) {
+-   case 1:
+-      emit_vec4( ctx, rvb, data, stride, count );
+-      break;
+-   case 2:
+-      emit_vec8( ctx, rvb, data, stride, count );
+-      break;
+-   case 3:
+-      emit_vec12( ctx, rvb, data, stride, count );
+-      break;
+-   case 4:
+-      emit_vec16( ctx, rvb, data, stride, count );
+-      break;
+-   default:
+-      assert(0);
+-      exit(1);
+-      break;
+-   }
+-
+-}
+-
+-
+-
+-static void emit_s0_vec( GLcontext *ctx,
+-			 struct radeon_dma_region *rvb,
+-			 char *data,
+-			 int stride,
+-			 int count )
++static void emit_s0_vec(uint32_t *out, GLvoid *data, int stride, int count)
+ {
+    int i;
+-   int *out = (int *)(rvb->address + rvb->start);
+-
+    if (RADEON_DEBUG & DEBUG_VERTS)
+       fprintf(stderr, "%s count %d stride %d\n",
+ 	      __FUNCTION__, count, stride);
+@@ -385,14 +99,9 @@ static void emit_s0_vec( GLcontext *ctx,
+    }
+ }
+ 
+-static void emit_stq_vec( GLcontext *ctx,
+-			 struct radeon_dma_region *rvb,
+-			 char *data,
+-			 int stride,
+-			 int count )
++static void emit_stq_vec(uint32_t *out, GLvoid *data, int stride, int count)
+ {
+    int i;
+-   int *out = (int *)(rvb->address + rvb->start);
+ 
+    if (RADEON_DEBUG & DEBUG_VERTS)
+       fprintf(stderr, "%s count %d stride %d\n",
+@@ -410,21 +119,16 @@ static void emit_stq_vec( GLcontext *ctx,
+ 
+ 
+ 
+-static void emit_tex_vector( GLcontext *ctx,
+-			     struct radeon_dma_region *rvb,
+-			     char *data,
+-			     int size,
+-			     int stride,
+-			     int count )
++static void emit_tex_vector(GLcontext *ctx, struct radeon_aos *aos,
++			    GLvoid *data, int size, int stride, int count)
+ {
+    radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+    int emitsize;
++   uint32_t *out;
+ 
+    if (RADEON_DEBUG & DEBUG_VERTS)
+       fprintf(stderr, "%s %d/%d\n", __FUNCTION__, count, size);
+ 
+-   assert (!rvb->buf);
+-
+    switch (size) {
+    case 4: emitsize = 3; break;
+    case 3: emitsize = 3; break;
+@@ -433,34 +137,33 @@ static void emit_tex_vector( GLcontext *ctx,
+ 
+ 
+    if (stride == 0) {
+-      radeonAllocDmaRegion( rmesa, rvb, 4 * emitsize, 4 );
++      radeonAllocDmaRegion(rmesa, &aos->bo, &aos->offset, emitsize * 4, 32);
+       count = 1;
+-      rvb->aos_start = GET_START(rvb);
+-      rvb->aos_stride = 0;
+-      rvb->aos_size = emitsize;
++      aos->stride = 0;
+    }
+    else {
+-      radeonAllocDmaRegion( rmesa, rvb, 4 * emitsize * count, 4 );
+-      rvb->aos_start = GET_START(rvb);
+-      rvb->aos_stride = emitsize;
+-      rvb->aos_size = emitsize;
++      radeonAllocDmaRegion(rmesa, &aos->bo, &aos->offset, emitsize * count * 4, 32);
++      aos->stride = emitsize;
+    }
+ 
++   aos->components = emitsize;
++   aos->count = count;
+ 
+    /* Emit the data
+     */
++   out = (uint32_t*)((char*)aos->bo->ptr + aos->offset);
+    switch (size) {
+    case 1:
+-      emit_s0_vec( ctx, rvb, data, stride, count ); 
++      emit_s0_vec( out, data, stride, count );
+       break;
+    case 2:
+-      emit_vec8( ctx, rvb, data, stride, count );
++      radeonEmitVec8( out, data, stride, count );
+       break;
+    case 3:
+-      emit_vec12( ctx, rvb, data, stride, count );
++      radeonEmitVec12( out, data, stride, count );
+       break;
+    case 4:
+-      emit_stq_vec( ctx, rvb, data, stride, count );
++      emit_stq_vec( out, data, stride, count );
+       break;
+    default:
+       assert(0);
+@@ -477,9 +180,8 @@ static void emit_tex_vector( GLcontext *ctx,
+  */
+ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
++   r100ContextPtr rmesa = R100_CONTEXT( ctx );
+    struct vertex_buffer *VB = &TNL_CONTEXT( ctx )->vb;
+-   struct radeon_dma_region **component = rmesa->tcl.aos_components;
+    GLuint nr = 0;
+    GLuint vfmt = 0;
+    GLuint count = VB->Count;
+@@ -492,12 +194,12 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
+ 
+    if (1) {
+       if (!rmesa->tcl.obj.buf) 
+-	 emit_vector( ctx, 
+-		      &rmesa->tcl.obj, 
+-		      (char *)VB->ObjPtr->data,
+-		      VB->ObjPtr->size,
+-		      VB->ObjPtr->stride,
+-		      count);
++	rcommon_emit_vector( ctx, 
++			     &(rmesa->tcl.aos[nr]),
++			     (char *)VB->ObjPtr->data,
++			     VB->ObjPtr->size,
++			     VB->ObjPtr->stride,
++			     count);
+ 
+       switch( VB->ObjPtr->size ) {
+       case 4: vfmt |= RADEON_CP_VC_FRMT_W0;
+@@ -506,21 +208,21 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
+       default:
+          break;
+       }
+-      component[nr++] = &rmesa->tcl.obj;
++      nr++;
+    }
+    
+ 
+    if (inputs & VERT_BIT_NORMAL) {
+       if (!rmesa->tcl.norm.buf)
+-	 emit_vector( ctx, 
+-		      &(rmesa->tcl.norm), 
+-		      (char *)VB->NormalPtr->data,
+-		      3,
+-		      VB->NormalPtr->stride,
+-		      count);
++	 rcommon_emit_vector( ctx, 
++			      &(rmesa->tcl.aos[nr]),
++			      (char *)VB->NormalPtr->data,
++			      3,
++			      VB->NormalPtr->stride,
++			      count);
+ 
+       vfmt |= RADEON_CP_VC_FRMT_N0;
+-      component[nr++] = &rmesa->tcl.norm;
++      nr++;
+    }
+ 
+    if (inputs & VERT_BIT_COLOR0) {
+@@ -538,31 +240,30 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
+       }
+ 
+       if (!rmesa->tcl.rgba.buf)
+-	 emit_vector( ctx,
+-		      &(rmesa->tcl.rgba),
+-		      (char *)VB->ColorPtr[0]->data,
+-		      emitsize,
+-		      VB->ColorPtr[0]->stride,
+-		      count);
+-
+-
+-      component[nr++] = &rmesa->tcl.rgba;
++	rcommon_emit_vector( ctx,
++			     &(rmesa->tcl.aos[nr]),
++			     (char *)VB->ColorPtr[0]->data,
++			     emitsize,
++			     VB->ColorPtr[0]->stride,
++			     count);
++
++      nr++;
+    }
+ 
+ 
+    if (inputs & VERT_BIT_COLOR1) {
+       if (!rmesa->tcl.spec.buf) {
+ 
+-	 emit_vector( ctx,
+-		      &rmesa->tcl.spec,
+-		      (char *)VB->SecondaryColorPtr[0]->data,
+-		      3,
+-		      VB->SecondaryColorPtr[0]->stride,
+-		      count);
++	rcommon_emit_vector( ctx,
++			     &(rmesa->tcl.aos[nr]),
++			     (char *)VB->SecondaryColorPtr[0]->data,
++			     3,
++			     VB->SecondaryColorPtr[0]->stride,
++			     count);
+       }
+ 
+       vfmt |= RADEON_CP_VC_FRMT_FPSPEC;
+-      component[nr++] = &rmesa->tcl.spec;
++      nr++;
+    }
+ 
+ /* FIXME: not sure if this is correct. May need to stitch this together with
+@@ -571,13 +272,13 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
+    if (inputs & VERT_BIT_FOG) {
+       if (!rmesa->tcl.fog.buf)
+ 	 emit_vecfog( ctx,
+-		      &(rmesa->tcl.fog),
++		      &(rmesa->tcl.aos[nr]),
+ 		      (char *)VB->FogCoordPtr->data,
+ 		      VB->FogCoordPtr->stride,
+ 		      count);
+ 
+       vfmt |= RADEON_CP_VC_FRMT_FPFOG;
+-      component[nr++] = &rmesa->tcl.fog;
++      nr++;
+    }
+ 
+ 
+@@ -588,11 +289,12 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
+       if (inputs & VERT_BIT_TEX(unit)) {
+ 	 if (!rmesa->tcl.tex[unit].buf)
+ 	    emit_tex_vector( ctx,
+-			     &(rmesa->tcl.tex[unit]),
++			     &(rmesa->tcl.aos[nr]),
+ 			     (char *)VB->TexCoordPtr[unit]->data,
+ 			     VB->TexCoordPtr[unit]->size,
+ 			     VB->TexCoordPtr[unit]->stride,
+ 			     count );
++	 nr++;
+ 
+ 	 vfmt |= RADEON_ST_BIT(unit);
+          /* assume we need the 3rd coord if texgen is active for r/q OR at least
+@@ -610,7 +312,6 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
+ 		 (swaptexmatcol != ((rmesa->TexMatColSwap >> unit) & 1)))
+ 	       radeonUploadTexMatrix( rmesa, unit, swaptexmatcol ) ;
+ 	 }
+-	 component[nr++] = &rmesa->tcl.tex[unit];
+       }
+    }
+ 
+@@ -626,31 +327,13 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
+ 
+ void radeonReleaseArrays( GLcontext *ctx, GLuint newinputs )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
+-   GLuint unit;
+-
+-#if 0
+-   if (RADEON_DEBUG & DEBUG_VERTS) 
+-      _tnl_print_vert_flags( __FUNCTION__, newinputs );
+-#endif
+-
+-   if (newinputs & VERT_BIT_POS) 
+-     radeonReleaseDmaRegion( rmesa, &rmesa->tcl.obj, __FUNCTION__ );
+-
+-   if (newinputs & VERT_BIT_NORMAL) 
+-      radeonReleaseDmaRegion( rmesa, &rmesa->tcl.norm, __FUNCTION__ );
+-
+-   if (newinputs & VERT_BIT_COLOR0) 
+-      radeonReleaseDmaRegion( rmesa, &rmesa->tcl.rgba, __FUNCTION__ );
+-
+-   if (newinputs & VERT_BIT_COLOR1) 
+-      radeonReleaseDmaRegion( rmesa, &rmesa->tcl.spec, __FUNCTION__ );
+-      
+-   if (newinputs & VERT_BIT_FOG)
+-      radeonReleaseDmaRegion( rmesa, &rmesa->tcl.fog, __FUNCTION__ );
++   r100ContextPtr rmesa = R100_CONTEXT( ctx );
++   int i;
+ 
+-   for (unit = 0 ; unit < ctx->Const.MaxTextureUnits; unit++) {
+-      if (newinputs & VERT_BIT_TEX(unit))
+-         radeonReleaseDmaRegion( rmesa, &rmesa->tcl.tex[unit], __FUNCTION__ );
++   for (i = 0; i < rmesa->tcl.nr_aos_components; i++) {
++     if (rmesa->tcl.aos[i].bo) {
++       radeon_bo_unref(rmesa->tcl.aos[i].bo);
++       rmesa->tcl.aos[i].bo = NULL;
++     }
+    }
+ }
+diff --git a/src/mesa/drivers/dri/radeon/radeon_maos_verts.c b/src/mesa/drivers/dri/radeon/radeon_maos_verts.c
+index 126d072..d468a97 100644
+--- a/src/mesa/drivers/dri/radeon/radeon_maos_verts.c
++++ b/src/mesa/drivers/dri/radeon/radeon_maos_verts.c
+@@ -310,7 +310,7 @@ static void init_tcl_verts( void )
+ 
+ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    struct vertex_buffer *VB = &TNL_CONTEXT(ctx)->vb;
+    GLuint req = 0;
+    GLuint unit;
+@@ -374,14 +374,15 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
+ 	 break;
+ 
+    if (rmesa->tcl.vertex_format == setup_tab[i].vertex_format &&
+-       rmesa->tcl.indexed_verts.buf)
++       rmesa->tcl.aos[0].bo)
+       return;
+ 
+-   if (rmesa->tcl.indexed_verts.buf)
++   if (rmesa->tcl.aos[0].bo)
+       radeonReleaseArrays( ctx, ~0 );
+ 
+-   radeonAllocDmaRegion( rmesa,
+-			 &rmesa->tcl.indexed_verts, 
++   radeonAllocDmaRegion( &rmesa->radeon,
++			 &rmesa->tcl.aos[0].bo,
++			 &rmesa->tcl.aos[0].offset,
+ 			 VB->Count * setup_tab[i].vertex_size * 4, 
+ 			 4);
+ 
+@@ -421,15 +422,11 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
+ 
+ 
+    setup_tab[i].emit( ctx, 0, VB->Count, 
+-		      rmesa->tcl.indexed_verts.address + 
+-		      rmesa->tcl.indexed_verts.start );
++		      rmesa->tcl.aos[0].bo->ptr + rmesa->tcl.aos[0].offset);
+ 
++   //   rmesa->tcl.aos[0].size = setup_tab[i].vertex_size;
++   rmesa->tcl.aos[0].stride = setup_tab[i].vertex_size;
+    rmesa->tcl.vertex_format = setup_tab[i].vertex_format;
+-   rmesa->tcl.indexed_verts.aos_start = GET_START( &rmesa->tcl.indexed_verts );
+-   rmesa->tcl.indexed_verts.aos_size = setup_tab[i].vertex_size;
+-   rmesa->tcl.indexed_verts.aos_stride = setup_tab[i].vertex_size;
+-
+-   rmesa->tcl.aos_components[0] = &rmesa->tcl.indexed_verts;
+    rmesa->tcl.nr_aos_components = 1;
+ }
+ 
+@@ -437,13 +434,13 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
+ 
+ void radeonReleaseArrays( GLcontext *ctx, GLuint newinputs )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
+-
+-#if 0
+-   if (RADEON_DEBUG & DEBUG_VERTS) 
+-      _tnl_print_vert_flags( __FUNCTION__, newinputs );
+-#endif
++   r100ContextPtr rmesa = R100_CONTEXT( ctx );
++   int i;
+ 
+-   if (newinputs) 
+-     radeonReleaseDmaRegion( rmesa, &rmesa->tcl.indexed_verts, __FUNCTION__ );
++   for (i = 0; i < rmesa->tcl.nr_aos_components; i++) {
++      if (rmesa->tcl.aos[i].bo) {
++         radeon_bo_unref(rmesa->tcl.aos[i].bo);
++         rmesa->tcl.aos[i].bo = NULL;
++      }
++   }
+ }
+diff --git a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c
+new file mode 100644
+index 0000000..3203ee1
+--- /dev/null
++++ b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c
+@@ -0,0 +1,360 @@
++/*
++ * Copyright (C) 2008 Nicolai Haehnle.
++ *
++ * All Rights Reserved.
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining
++ * a copy of this software and associated documentation files (the
++ * "Software"), to deal in the Software without restriction, including
++ * without limitation the rights to use, copy, modify, merge, publish,
++ * distribute, sublicense, and/or sell copies of the Software, and to
++ * permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the
++ * next paragraph) shall be included in all copies or substantial
++ * portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
++ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
++ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
++ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
++ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
++ *
++ */
 +
-+	if (RADEON_DEBUG & DEBUG_VERTS)
-+		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
-+			__FUNCTION__, count, stride, (void *)out, (void *)data);
++#include "radeon_mipmap_tree.h"
 +
-+	if (stride == 12) {
-+		COPY_DWORDS(out, data, count * 3);
-+    }
-+	else
-+		for (i = 0; i < count; i++) {
-+			out[0] = *(int *)data;
-+			out[1] = *(int *)(data + 4);
-+			out[2] = *(int *)(data + 8);
-+			out += 3;
-+			data += stride;
-+		}
-+}
++#include <errno.h>
++#include <unistd.h>
 +
-+static void radeonEmitVec16(uint32_t *out, GLvoid * data, int stride, int count)
++#include "main/simple_list.h"
++#include "main/texcompress.h"
++#include "main/texformat.h"
++
++static GLuint radeon_compressed_texture_size(GLcontext *ctx,
++		GLsizei width, GLsizei height, GLsizei depth,
++		GLuint mesaFormat)
 +{
-+	int i;
++	GLuint size = _mesa_compressed_texture_size(ctx, width, height, depth, mesaFormat);
 +
-+	if (RADEON_DEBUG & DEBUG_VERTS)
-+		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
-+			__FUNCTION__, count, stride, (void *)out, (void *)data);
++	if (mesaFormat == MESA_FORMAT_RGB_DXT1 ||
++	    mesaFormat == MESA_FORMAT_RGBA_DXT1) {
++		if (width + 3 < 8)	/* width one block */
++			size = size * 4;
++		else if (width + 3 < 16)
++			size = size * 2;
++	} else {
++		/* DXT3/5, 16 bytes per block */
++	  //		WARN_ONCE("DXT 3/5 suffers from multitexturing problems!\n");
++		if (width + 3 < 8)
++			size = size * 2;
++	}
 +
-+	if (stride == 16)
-+		COPY_DWORDS(out, data, count * 4);
-+	else
-+		for (i = 0; i < count; i++) {
-+			out[0] = *(int *)data;
-+			out[1] = *(int *)(data + 4);
-+			out[2] = *(int *)(data + 8);
-+			out[3] = *(int *)(data + 12);
-+			out += 4;
-+			data += stride;
-+		}
++	return size;
 +}
 +
-+void rcommon_emit_vector(GLcontext * ctx, struct radeon_aos *aos,
-+			 GLvoid * data, int size, int stride, int count)
++
++static int radeon_compressed_num_bytes(GLuint mesaFormat)
++{
++   int bytes = 0;
++   switch(mesaFormat) {
++     
++   case MESA_FORMAT_RGB_FXT1:
++   case MESA_FORMAT_RGBA_FXT1:
++   case MESA_FORMAT_RGB_DXT1:
++   case MESA_FORMAT_RGBA_DXT1:
++     bytes = 2;
++     break;
++     
++   case MESA_FORMAT_RGBA_DXT3:
++   case MESA_FORMAT_RGBA_DXT5:
++     bytes = 4;
++   default:
++     break;
++   }
++   
++   return bytes;
++}
++
++/**
++ * Compute sizes and fill in offset and blit information for the given
++ * image (determined by \p face and \p level).
++ *
++ * \param curOffset points to the offset at which the image is to be stored
++ * and is updated by this function according to the size of the image.
++ */
++static void compute_tex_image_offset(radeon_mipmap_tree *mt,
++	GLuint face, GLuint level, GLuint* curOffset)
 +{
-+	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-+	uint32_t *out;
++	radeon_mipmap_level *lvl = &mt->levels[level];
 +
-+	if (stride == 0) {
-+		radeonAllocDmaRegion(rmesa, &aos->bo, &aos->offset, size * 4, 32);
-+		count = 1;
-+		aos->stride = 0;
++	/* Find image size in bytes */
++	if (mt->compressed) {
++		/* TODO: Is this correct? Need test cases for compressed textures! */
++		GLuint align;
++
++		lvl->rowstride = (lvl->width * mt->bpp + 63) & ~63;
++		lvl->size = radeon_compressed_texture_size(mt->radeon->glCtx,
++							   lvl->width, lvl->height, lvl->depth, mt->compressed);
++	} else if (mt->target == GL_TEXTURE_RECTANGLE_NV) {
++		lvl->rowstride = (lvl->width * mt->bpp + 63) & ~63;
++		lvl->size = lvl->rowstride * lvl->height;
++	} else if (mt->tilebits & RADEON_TXO_MICRO_TILE) {
++		/* tile pattern is 16 bytes x2. mipmaps stay 32 byte aligned,
++		 * though the actual offset may be different (if texture is less than
++		 * 32 bytes width) to the untiled case */
++		lvl->rowstride = (lvl->width * mt->bpp * 2 + 31) & ~31;
++		lvl->size = lvl->rowstride * ((lvl->height + 1) / 2) * lvl->depth;
 +	} else {
-+		radeonAllocDmaRegion(rmesa, &aos->bo, &aos->offset, size * count * 4, 32);
-+		aos->stride = size;
++		lvl->rowstride = (lvl->width * mt->bpp + 31) & ~31;
++		lvl->size = lvl->rowstride * lvl->height * lvl->depth;
 +	}
++	assert(lvl->size > 0);
 +
-+	aos->components = size;
-+	aos->count = count;
++	/* All images are aligned to a 32-byte offset */
++	*curOffset = (*curOffset + 0x1f) & ~0x1f;
++	lvl->faces[face].offset = *curOffset;
++	*curOffset += lvl->size;
 +
-+	out = (uint32_t*)((char*)aos->bo->ptr + aos->offset);
-+	switch (size) {
-+	case 1: radeonEmitVec4(out, data, stride, count); break;
-+	case 2: radeonEmitVec8(out, data, stride, count); break;
-+	case 3: radeonEmitVec12(out, data, stride, count); break;
-+	case 4: radeonEmitVec16(out, data, stride, count); break;
-+	default:
-+		assert(0);
-+		break;
-+	}
++	if (RADEON_DEBUG & DEBUG_TEXTURE)
++	  fprintf(stderr,
++		  "level %d, face %d: rs:%d %dx%d at %d\n",
++		  level, face, lvl->rowstride, lvl->width, lvl->height, lvl->faces[face].offset);
 +}
 +
-+void radeonRefillCurrentDmaRegion(radeonContextPtr rmesa, int size)
++static GLuint minify(GLuint size, GLuint levels)
 +{
-+	struct radeon_cs_space_check bos[1];
-+	int flushed = 0, ret;
-+
-+	size = MAX2(size, MAX_DMA_BUF_SZ * 16);
-+
-+	if (RADEON_DEBUG & (DEBUG_IOCTL | DEBUG_DMA))
-+		fprintf(stderr, "%s\n", __FUNCTION__);
++	size = size >> levels;
++	if (size < 1)
++		size = 1;
++	return size;
++}
 +
-+	if (rmesa->dma.flush) {
-+		rmesa->dma.flush(rmesa->glCtx);
-+	}
++static void calculate_miptree_layout(radeon_mipmap_tree *mt)
++{
++	GLuint curOffset;
++	GLuint numLevels;
++	GLuint i;
 +
-+	if (rmesa->dma.nr_released_bufs > 4) {
-+		rcommonFlushCmdBuf(rmesa, __FUNCTION__);
-+		rmesa->dma.nr_released_bufs = 0;
-+	}
++	numLevels = mt->lastLevel - mt->firstLevel + 1;
++	assert(numLevels <= RADEON_MAX_TEXTURE_LEVELS);
 +
-+	if (rmesa->dma.current) {
-+		radeon_bo_unmap(rmesa->dma.current);
-+		radeon_bo_unref(rmesa->dma.current);
-+		rmesa->dma.current = 0;
-+	}
++	curOffset = 0;
++	for(i = 0; i < numLevels; i++) {
++		GLuint face;
 +
-+again_alloc:	
-+	rmesa->dma.current = radeon_bo_open(rmesa->radeonScreen->bom,
-+					    0, size, 4, RADEON_GEM_DOMAIN_GTT,
-+					    0);
++		mt->levels[i].width = minify(mt->width0, i);
++		mt->levels[i].height = minify(mt->height0, i);
++		mt->levels[i].depth = minify(mt->depth0, i);
 +
-+	if (!rmesa->dma.current) {
-+		rcommonFlushCmdBuf(rmesa, __FUNCTION__);
-+		rmesa->dma.nr_released_bufs = 0;
-+		goto again_alloc;
++		for(face = 0; face < mt->faces; face++)
++			compute_tex_image_offset(mt, face, i, &curOffset);
 +	}
 +
-+	rmesa->dma.current_used = 0;
-+	rmesa->dma.current_vertexptr = 0;
-+	
-+	bos[0].bo = rmesa->dma.current;
-+	bos[0].read_domains = RADEON_GEM_DOMAIN_GTT;
-+	bos[0].write_domain =0 ;
-+	bos[0].new_accounted = 0;
-+
-+	ret = radeon_cs_space_check(rmesa->cmdbuf.cs, bos, 1);
-+	if (ret == RADEON_CS_SPACE_OP_TO_BIG) {
-+		fprintf(stderr,"Got OPEARTION TO BIG ILLEGAL - this cannot happen");
-+		assert(0);
-+	} else if (ret == RADEON_CS_SPACE_FLUSH) {
-+		rcommonFlushCmdBuf(rmesa, __FUNCTION__);
-+		if (flushed) {
-+			fprintf(stderr,"flushed but still no space\n");
-+			assert(0);
-+		}
-+		flushed = 1;
-+		goto again_alloc;
-+	}
-+	radeon_bo_map(rmesa->dma.current, 1);
++	/* Note the required size in memory */
++	mt->totalsize = (curOffset + RADEON_OFFSET_MASK) & ~RADEON_OFFSET_MASK;
 +}
 +
-+/* Allocates a region from rmesa->dma.current.  If there isn't enough
-+ * space in current, grab a new buffer (and discard what was left of current)
++
++/**
++ * Create a new mipmap tree, calculate its layout and allocate memory.
 + */
-+void radeonAllocDmaRegion(radeonContextPtr rmesa,
-+			  struct radeon_bo **pbo, int *poffset,
-+			  int bytes, int alignment)
++radeon_mipmap_tree* radeon_miptree_create(radeonContextPtr rmesa, radeonTexObj *t,
++		GLenum target, GLuint firstLevel, GLuint lastLevel,
++		GLuint width0, GLuint height0, GLuint depth0,
++		GLuint bpp, GLuint tilebits, GLuint compressed)
 +{
-+	if (RADEON_DEBUG & DEBUG_IOCTL)
-+		fprintf(stderr, "%s %d\n", __FUNCTION__, bytes);
++	radeon_mipmap_tree *mt = CALLOC_STRUCT(_radeon_mipmap_tree);
 +
-+	if (rmesa->dma.flush)
-+		rmesa->dma.flush(rmesa->glCtx);
++	mt->radeon = rmesa;
++	mt->refcount = 1;
++	mt->t = t;
++	mt->target = target;
++	mt->faces = (target == GL_TEXTURE_CUBE_MAP) ? 6 : 1;
++	mt->firstLevel = firstLevel;
++	mt->lastLevel = lastLevel;
++	mt->width0 = width0;
++	mt->height0 = height0;
++	mt->depth0 = depth0;
++	mt->bpp = compressed ? radeon_compressed_num_bytes(compressed) : bpp;
++	mt->tilebits = tilebits;
++	mt->compressed = compressed;
 +
-+	assert(rmesa->dma.current_used == rmesa->dma.current_vertexptr);
++	calculate_miptree_layout(mt);
 +
-+	alignment--;
-+	rmesa->dma.current_used = (rmesa->dma.current_used + alignment) & ~alignment;
++	mt->bo = radeon_bo_open(rmesa->radeonScreen->bom,
++                            0, mt->totalsize, 1024,
++                            RADEON_GEM_DOMAIN_VRAM,
++                            0);
 +
-+	if (!rmesa->dma.current || rmesa->dma.current_used + bytes > rmesa->dma.current->size)
-+		radeonRefillCurrentDmaRegion(rmesa, (bytes + 15) & ~15);
++	return mt;
++}
 +
-+	*poffset = rmesa->dma.current_used;
-+	*pbo = rmesa->dma.current;
-+	radeon_bo_ref(*pbo);
++void radeon_miptree_reference(radeon_mipmap_tree *mt)
++{
++	mt->refcount++;
++	assert(mt->refcount > 0);
++}
 +
-+	/* Always align to at least 16 bytes */
-+	rmesa->dma.current_used = (rmesa->dma.current_used + bytes + 15) & ~15;
-+	rmesa->dma.current_vertexptr = rmesa->dma.current_used;
++void radeon_miptree_unreference(radeon_mipmap_tree *mt)
++{
++	if (!mt)
++		return;
 +
-+	assert(rmesa->dma.current_used <= rmesa->dma.current->size);
++	assert(mt->refcount > 0);
++	mt->refcount--;
++	if (!mt->refcount) {
++		radeon_bo_unref(mt->bo);
++		free(mt);
++	}
 +}
 +
-+void radeonReleaseDmaRegion(radeonContextPtr rmesa)
++
++/**
++ * Calculate first and last mip levels for the given texture object,
++ * where the dimensions are taken from the given texture image at
++ * the given level.
++ *
++ * Note: level is the OpenGL level number, which is not necessarily the same
++ * as the first level that is actually present.
++ *
++ * The base level image of the given texture face must be non-null,
++ * or this will fail.
++ */
++static void calculate_first_last_level(struct gl_texture_object *tObj,
++				       GLuint *pfirstLevel, GLuint *plastLevel,
++				       GLuint face, GLuint level)
 +{
-+	if (RADEON_DEBUG & DEBUG_IOCTL)
-+		fprintf(stderr, "%s %p\n", __FUNCTION__, rmesa->dma.current);
-+	if (rmesa->dma.current) {
-+		rmesa->dma.nr_released_bufs++;
-+		radeon_bo_unmap(rmesa->dma.current);
-+	        radeon_bo_unref(rmesa->dma.current);
++	const struct gl_texture_image * const baseImage =
++		tObj->Image[face][level];
++
++	assert(baseImage);
++	
++	/* These must be signed values.  MinLod and MaxLod can be negative numbers,
++	* and having firstLevel and lastLevel as signed prevents the need for
++	* extra sign checks.
++	*/
++	int   firstLevel;
++	int   lastLevel;
++
++	/* Yes, this looks overly complicated, but it's all needed.
++	*/
++	switch (tObj->Target) {
++	case GL_TEXTURE_1D:
++	case GL_TEXTURE_2D:
++	case GL_TEXTURE_3D:
++	case GL_TEXTURE_CUBE_MAP:
++		if (tObj->MinFilter == GL_NEAREST || tObj->MinFilter == GL_LINEAR) {
++			/* GL_NEAREST and GL_LINEAR only care about GL_TEXTURE_BASE_LEVEL.
++			*/
++			firstLevel = lastLevel = tObj->BaseLevel;
++		} else {
++			firstLevel = tObj->BaseLevel + (GLint)(tObj->MinLod + 0.5);
++			firstLevel = MAX2(firstLevel, tObj->BaseLevel);
++			firstLevel = MIN2(firstLevel, level + baseImage->MaxLog2);
++			lastLevel = tObj->BaseLevel + (GLint)(tObj->MaxLod + 0.5);
++			lastLevel = MAX2(lastLevel, tObj->BaseLevel);
++			lastLevel = MIN2(lastLevel, level + baseImage->MaxLog2);
++			lastLevel = MIN2(lastLevel, tObj->MaxLevel);
++			lastLevel = MAX2(firstLevel, lastLevel); /* need at least one level */
++		}
++		break;
++	case GL_TEXTURE_RECTANGLE_NV:
++	case GL_TEXTURE_4D_SGIS:
++		firstLevel = lastLevel = 0;
++		break;
++	default:
++		return;
 +	}
-+	rmesa->dma.current = NULL;
++
++	/* save these values */
++	*pfirstLevel = firstLevel;
++	*plastLevel = lastLevel;
 +}
 +
 +
-+/* Flush vertices in the current dma region.
++/**
++ * Checks whether the given miptree can hold the given texture image at the
++ * given face and level.
 + */
-+void rcommon_flush_last_swtcl_prim( GLcontext *ctx  )
++GLboolean radeon_miptree_matches_image(radeon_mipmap_tree *mt,
++		struct gl_texture_image *texImage, GLuint face, GLuint level)
 +{
-+	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-+	struct radeon_dma *dma = &rmesa->dma;
-+		
++	radeon_mipmap_level *lvl;
 +
-+	if (RADEON_DEBUG & DEBUG_IOCTL)
-+		fprintf(stderr, "%s\n", __FUNCTION__);
-+	dma->flush = NULL;
++	if (face >= mt->faces || level < mt->firstLevel || level > mt->lastLevel)
++		return GL_FALSE;
 +
-+	if (dma->current) {
-+	    GLuint current_offset = dma->current_used;
++	if (texImage->IsCompressed != mt->compressed)
++		return GL_FALSE;
 +
-+	    assert (dma->current_used +
-+		    rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
-+		    dma->current_vertexptr);
++	if (!texImage->IsCompressed &&
++	    !mt->compressed &&
++	    texImage->TexFormat->TexelBytes != mt->bpp)
++		return GL_FALSE;
 +
-+	    if (dma->current_used != dma->current_vertexptr) {
-+		    dma->current_used = dma->current_vertexptr;
++	lvl = &mt->levels[level - mt->firstLevel];
++	if (lvl->width != texImage->Width ||
++	    lvl->height != texImage->Height ||
++	    lvl->depth != texImage->Depth)
++		return GL_FALSE;
 +
-+		    rmesa->vtbl.swtcl_flush(ctx, current_offset);
-+	    }
-+	    rmesa->swtcl.numverts = 0;
-+	}
++	return GL_TRUE;
 +}
-+/* Alloc space in the current dma region.
++
++
++/**
++ * Checks whether the given miptree has the right format to store the given texture object.
 + */
-+void *
-+rcommonAllocDmaLowVerts( radeonContextPtr rmesa, int nverts, int vsize )
++GLboolean radeon_miptree_matches_texture(radeon_mipmap_tree *mt, struct gl_texture_object *texObj)
 +{
-+	GLuint bytes = vsize * nverts;
-+	void *head;
-+
-+	if (!rmesa->dma.current || rmesa->dma.current_vertexptr + bytes > rmesa->dma.current->size) {
-+                radeonRefillCurrentDmaRegion(rmesa, bytes);
-+	}
++	struct gl_texture_image *firstImage;
++	GLuint compressed;
++	GLuint numfaces = 1;
++	GLuint firstLevel, lastLevel;
 +
-+        if (!rmesa->dma.flush) {
-+                rmesa->glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
-+                rmesa->dma.flush = rcommon_flush_last_swtcl_prim;
-+        }
++	calculate_first_last_level(texObj, &firstLevel, &lastLevel, 0, texObj->BaseLevel);
++	if (texObj->Target == GL_TEXTURE_CUBE_MAP)
++		numfaces = 6;
 +
-+	ASSERT( vsize == rmesa->swtcl.vertex_size * 4 );
-+        ASSERT( rmesa->dma.flush == rcommon_flush_last_swtcl_prim );
-+        ASSERT( rmesa->dma.current_used +
-+                rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
-+                rmesa->dma.current_vertexptr );
++	firstImage = texObj->Image[0][firstLevel];
++	compressed = firstImage->IsCompressed ? firstImage->TexFormat->MesaFormat : 0;
 +
-+	head = (rmesa->dma.current->ptr + rmesa->dma.current_vertexptr);
-+	rmesa->dma.current_vertexptr += bytes;
-+	rmesa->swtcl.numverts += nverts;
-+	return head;
++	return (mt->firstLevel == firstLevel &&
++	        mt->lastLevel == lastLevel &&
++	        mt->width0 == firstImage->Width &&
++	        mt->height0 == firstImage->Height &&
++	        mt->depth0 == firstImage->Depth &&
++	        mt->bpp == firstImage->TexFormat->TexelBytes &&
++	        mt->compressed == compressed);
 +}
-diff --git a/src/mesa/drivers/dri/radeon/radeon_dma.h b/src/mesa/drivers/dri/radeon/radeon_dma.h
-new file mode 100644
-index 0000000..cee3744
---- /dev/null
-+++ b/src/mesa/drivers/dri/radeon/radeon_dma.h
-@@ -0,0 +1,51 @@
-+/**************************************************************************
-+
-+Copyright (C) 2004 Nicolai Haehnle.
-+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
-+
-+The Weather Channel (TM) funded Tungsten Graphics to develop the
-+initial release of the Radeon 8500 driver under the XFree86 license.
-+This notice must be preserved.
-+
-+All Rights Reserved.
-+
-+Permission is hereby granted, free of charge, to any person obtaining a
-+copy of this software and associated documentation files (the "Software"),
-+to deal in the Software without restriction, including without limitation
-+on the rights to use, copy, modify, merge, publish, distribute, sub
-+license, and/or sell copies of the Software, and to permit persons to whom
-+the Software is furnished to do so, subject to the following conditions:
-+
-+The above copyright notice and this permission notice (including the next
-+paragraph) shall be included in all copies or substantial portions of the
-+Software.
-+
-+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-+FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
-+ATI, VA LINUX SYSTEMS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
-+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
-+OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
-+USE OR OTHER DEALINGS IN THE SOFTWARE.
-+
-+**************************************************************************/
 +
-+#ifndef RADEON_DMA_H
-+#define RADEON_DMA_H
 +
-+void radeonEmitVec8(uint32_t *out, GLvoid * data, int stride, int count);
-+void radeonEmitVec12(uint32_t *out, GLvoid * data, int stride, int count);
++/**
++ * Try to allocate a mipmap tree for the given texture that will fit the
++ * given image in the given position.
++ */
++void radeon_try_alloc_miptree(radeonContextPtr rmesa, radeonTexObj *t,
++		struct gl_texture_image *texImage, GLuint face, GLuint level)
++{
++	GLuint compressed = texImage->IsCompressed ? texImage->TexFormat->MesaFormat : 0;
++	GLuint numfaces = 1;
++	GLuint firstLevel, lastLevel;
 +
-+void rcommon_emit_vector(GLcontext * ctx, struct radeon_aos *aos,
-+			 GLvoid * data, int size, int stride, int count);
++	assert(!t->mt);
 +
-+void radeonRefillCurrentDmaRegion(radeonContextPtr rmesa, int size);
-+void radeonAllocDmaRegion(radeonContextPtr rmesa,
-+			  struct radeon_bo **pbo, int *poffset,
-+			  int bytes, int alignment);
-+void radeonReleaseDmaRegion(radeonContextPtr rmesa);
++	calculate_first_last_level(&t->base, &firstLevel, &lastLevel, face, level);
++	if (t->base.Target == GL_TEXTURE_CUBE_MAP)
++		numfaces = 6;
 +
-+void rcommon_flush_last_swtcl_prim(GLcontext *ctx);
++	if (level != firstLevel || face >= numfaces)
++		return;
 +
-+void *rcommonAllocDmaLowVerts(radeonContextPtr rmesa, int nverts, int vsize);
-+#endif
-diff --git a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c
++	t->mt = radeon_miptree_create(rmesa, t, t->base.Target,
++		firstLevel, lastLevel,
++		texImage->Width, texImage->Height, texImage->Depth,
++		texImage->TexFormat->TexelBytes, t->tile_bits, compressed);
++}
+diff --git a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.h b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.h
 new file mode 100644
-index 0000000..3203ee1
+index 0000000..43dfa48
 --- /dev/null
-+++ b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c
-@@ -0,0 +1,360 @@
++++ b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.h
+@@ -0,0 +1,97 @@
 +/*
 + * Copyright (C) 2008 Nicolai Haehnle.
 + *
@@ -4549,442 +28260,5955 @@ index 0000000..3203ee1
 + *
 + */
 +
-+#include "radeon_mipmap_tree.h"
++#ifndef __RADEON_MIPMAP_TREE_H_
++#define __RADEON_MIPMAP_TREE_H_
++
++#include "radeon_common.h"
++
++typedef struct _radeon_mipmap_tree radeon_mipmap_tree;
++typedef struct _radeon_mipmap_level radeon_mipmap_level;
++typedef struct _radeon_mipmap_image radeon_mipmap_image;
++
++struct _radeon_mipmap_image {
++	GLuint offset; /** Offset of this image from the start of mipmap tree buffer, in bytes */
++};
++
++struct _radeon_mipmap_level {
++	GLuint width;
++	GLuint height;
++	GLuint depth;
++	GLuint size; /** Size of each image, in bytes */
++	GLuint rowstride; /** in bytes */
++	radeon_mipmap_image faces[6];
++};
++
++
++/**
++ * A mipmap tree contains texture images in the layout that the hardware
++ * expects.
++ *
++ * The meta-data of mipmap trees is immutable, i.e. you cannot change the
++ * layout on-the-fly; however, the texture contents (i.e. texels) can be
++ * changed.
++ */
++struct _radeon_mipmap_tree {
++	radeonContextPtr radeon;
++	radeonTexObj *t;
++	struct radeon_bo *bo;
++	GLuint refcount;
++
++	GLuint totalsize; /** total size of the miptree, in bytes */
++
++	GLenum target; /** GL_TEXTURE_xxx */
++	GLuint faces; /** # of faces: 6 for cubemaps, 1 otherwise */
++	GLuint firstLevel; /** First mip level stored in this mipmap tree */
++	GLuint lastLevel; /** Last mip level stored in this mipmap tree */
++
++	GLuint width0; /** Width of firstLevel image */
++	GLuint height0; /** Height of firstLevel image */
++	GLuint depth0; /** Depth of firstLevel image */
++
++	GLuint bpp; /** Bytes per texel */
++	GLuint tilebits; /** RADEON_TXO_xxx_TILE */
++	GLuint compressed; /** MESA_FORMAT_xxx indicating a compressed format, or 0 if uncompressed */
++
++	radeon_mipmap_level levels[RADEON_MAX_TEXTURE_LEVELS];
++};
++
++radeon_mipmap_tree* radeon_miptree_create(radeonContextPtr rmesa, radeonTexObj *t,
++		GLenum target, GLuint firstLevel, GLuint lastLevel,
++		GLuint width0, GLuint height0, GLuint depth0,
++		GLuint bpp, GLuint tilebits, GLuint compressed);
++void radeon_miptree_reference(radeon_mipmap_tree *mt);
++void radeon_miptree_unreference(radeon_mipmap_tree *mt);
++
++GLboolean radeon_miptree_matches_image(radeon_mipmap_tree *mt,
++		struct gl_texture_image *texImage, GLuint face, GLuint level);
++GLboolean radeon_miptree_matches_texture(radeon_mipmap_tree *mt, struct gl_texture_object *texObj);
++void radeon_try_alloc_miptree(radeonContextPtr rmesa, radeonTexObj *t,
++			      struct gl_texture_image *texImage, GLuint face, GLuint level);
++
++
++#endif /* __RADEON_MIPMAP_TREE_H_ */
+diff --git a/src/mesa/drivers/dri/radeon/radeon_sanity.c b/src/mesa/drivers/dri/radeon/radeon_sanity.c
+index 6613757..bbed838 100644
+--- a/src/mesa/drivers/dri/radeon/radeon_sanity.c
++++ b/src/mesa/drivers/dri/radeon/radeon_sanity.c
+@@ -973,7 +973,7 @@ static int radeon_emit_packet3_cliprect( drm_radeon_cmd_buffer_t *cmdbuf )
+ }
+ 
+ 
+-int radeonSanityCmdBuffer( radeonContextPtr rmesa,
++int radeonSanityCmdBuffer( r100ContextPtr rmesa,
+ 			   int nbox,
+ 			   drm_clip_rect_t *boxes )
+ {
+diff --git a/src/mesa/drivers/dri/radeon/radeon_sanity.h b/src/mesa/drivers/dri/radeon/radeon_sanity.h
+index 1ec06bc..f30eb1c 100644
+--- a/src/mesa/drivers/dri/radeon/radeon_sanity.h
++++ b/src/mesa/drivers/dri/radeon/radeon_sanity.h
+@@ -1,7 +1,7 @@
+ #ifndef RADEON_SANITY_H
+ #define RADEON_SANITY_H
+ 
+-extern int radeonSanityCmdBuffer( radeonContextPtr rmesa,
++extern int radeonSanityCmdBuffer( r100ContextPtr rmesa,
+ 				  int nbox,
+ 				  drm_clip_rect_t *boxes );
+ 
+diff --git a/src/mesa/drivers/dri/radeon/radeon_screen.c b/src/mesa/drivers/dri/radeon/radeon_screen.c
+index e3afaa9..c591e9f 100644
+--- a/src/mesa/drivers/dri/radeon/radeon_screen.c
++++ b/src/mesa/drivers/dri/radeon/radeon_screen.c
+@@ -35,6 +35,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+  * \author  Gareth Hughes <gareth@valinux.com>
+  */
+ 
++#include <errno.h>
+ #include "main/glheader.h"
+ #include "main/imports.h"
+ #include "main/mtypes.h"
+@@ -45,32 +46,39 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "radeon_chipset.h"
+ #include "radeon_macros.h"
+ #include "radeon_screen.h"
++#include "radeon_common.h"
++#include "radeon_span.h"
+ #if !RADEON_COMMON
+ #include "radeon_context.h"
+-#include "radeon_span.h"
+ #include "radeon_tex.h"
+ #elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
+ #include "r200_context.h"
+ #include "r200_ioctl.h"
+-#include "r200_span.h"
+ #include "r200_tex.h"
+ #elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
+ #include "r300_context.h"
+ #include "r300_fragprog.h"
+ #include "r300_tex.h"
+-#include "radeon_span.h"
+ #endif
+ 
+ #include "utils.h"
+ #include "vblank.h"
+ #include "drirenderbuffer.h"
+ 
++#include "radeon_bocs_wrapper.h"
++
+ #include "GL/internal/dri_interface.h"
+ 
+ /* Radeon configuration
+  */
+ #include "xmlpool.h"
+ 
++#define DRI_CONF_COMMAND_BUFFER_SIZE(def,min,max) \
++DRI_CONF_OPT_BEGIN_V(command_buffer_size,int,def, # min ":" # max ) \
++        DRI_CONF_DESC(en,"Size of command buffer (in KB)") \
++        DRI_CONF_DESC(de,"Grösse des Befehlspuffers (in KB)") \
++DRI_CONF_OPT_END
++
+ #if !RADEON_COMMON	/* R100 */
+ PUBLIC const char __driConfigOptions[] =
+ DRI_CONF_BEGIN
+@@ -80,6 +88,7 @@ DRI_CONF_BEGIN
+         DRI_CONF_VBLANK_MODE(DRI_CONF_VBLANK_DEF_INTERVAL_0)
+         DRI_CONF_MAX_TEXTURE_UNITS(3,2,3)
+         DRI_CONF_HYPERZ(false)
++        DRI_CONF_COMMAND_BUFFER_SIZE(8, 8, 32)
+     DRI_CONF_SECTION_END
+     DRI_CONF_SECTION_QUALITY
+         DRI_CONF_TEXTURE_DEPTH(DRI_CONF_TEXTURE_DEPTH_FB)
+@@ -95,7 +104,7 @@ DRI_CONF_BEGIN
+         DRI_CONF_NO_RAST(false)
+     DRI_CONF_SECTION_END
+ DRI_CONF_END;
+-static const GLuint __driNConfigOptions = 14;
++static const GLuint __driNConfigOptions = 15;
+ 
+ #elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
+ 
+@@ -107,6 +116,7 @@ DRI_CONF_BEGIN
+         DRI_CONF_VBLANK_MODE(DRI_CONF_VBLANK_DEF_INTERVAL_0)
+         DRI_CONF_MAX_TEXTURE_UNITS(6,2,6)
+         DRI_CONF_HYPERZ(false)
++        DRI_CONF_COMMAND_BUFFER_SIZE(8, 8, 32)
+     DRI_CONF_SECTION_END
+     DRI_CONF_SECTION_QUALITY
+         DRI_CONF_TEXTURE_DEPTH(DRI_CONF_TEXTURE_DEPTH_FB)
+@@ -126,7 +136,7 @@ DRI_CONF_BEGIN
+         DRI_CONF_NV_VERTEX_PROGRAM(false)
+     DRI_CONF_SECTION_END
+ DRI_CONF_END;
+-static const GLuint __driNConfigOptions = 16;
++static const GLuint __driNConfigOptions = 17;
+ 
+ extern const struct dri_extension blend_extensions[];
+ extern const struct dri_extension ARB_vp_extension[];
+@@ -149,11 +159,7 @@ DRI_CONF_OPT_BEGIN_V(texture_coord_units,int,def, # min ":" # max ) \
+         DRI_CONF_DESC(de,"Anzahl der Texturkoordinateneinheiten") \
+ DRI_CONF_OPT_END
+ 
+-#define DRI_CONF_COMMAND_BUFFER_SIZE(def,min,max) \
+-DRI_CONF_OPT_BEGIN_V(command_buffer_size,int,def, # min ":" # max ) \
+-        DRI_CONF_DESC(en,"Size of command buffer (in KB)") \
+-        DRI_CONF_DESC(de,"Grösse des Befehlspuffers (in KB)") \
+-DRI_CONF_OPT_END
++
+ 
+ #define DRI_CONF_DISABLE_S3TC(def) \
+ DRI_CONF_OPT_BEGIN(disable_s3tc,bool,def) \
+@@ -206,8 +212,9 @@ DRI_CONF_BEGIN
+ DRI_CONF_END;
+ static const GLuint __driNConfigOptions = 17;
+ 
++extern const struct dri_extension gl_20_extension[];
++
+ #ifndef RADEON_DEBUG
+-int RADEON_DEBUG = 0;
+ 
+ static const struct dri_debug_control debug_control[] = {
+ 	{"fall", DEBUG_FALLBACKS},
+@@ -349,137 +356,17 @@ static const __DRItexOffsetExtension r300texOffsetExtension = {
+     { __DRI_TEX_OFFSET, __DRI_TEX_OFFSET_VERSION },
+    r300SetTexOffset,
+ };
+-#endif
+-
+-/* Create the device specific screen private data struct.
+- */
+-static radeonScreenPtr
+-radeonCreateScreen( __DRIscreenPrivate *sPriv )
+-{
+-   radeonScreenPtr screen;
+-   RADEONDRIPtr dri_priv = (RADEONDRIPtr)sPriv->pDevPriv;
+-   unsigned char *RADEONMMIO;
+-   int i;
+-   int ret;
+-   uint32_t temp;
+-
+-   if (sPriv->devPrivSize != sizeof(RADEONDRIRec)) {
+-      fprintf(stderr,"\nERROR!  sizeof(RADEONDRIRec) does not match passed size from device driver\n");
+-      return GL_FALSE;
+-   }
+ 
+-   /* Allocate the private area */
+-   screen = (radeonScreenPtr) CALLOC( sizeof(*screen) );
+-   if ( !screen ) {
+-      __driUtilMessage("%s: Could not allocate memory for screen structure",
+-		       __FUNCTION__);
+-      return NULL;
+-   }
+-
+-#if DO_DEBUG && RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
+-	RADEON_DEBUG = driParseDebugString(getenv("RADEON_DEBUG"), debug_control);
++static const __DRItexBufferExtension r300TexBufferExtension = {
++    { __DRI_TEX_BUFFER, __DRI_TEX_BUFFER_VERSION },
++   r300SetTexBuffer,
++};
+ #endif
+ 
+-   /* parse information in __driConfigOptions */
+-   driParseOptionInfo (&screen->optionCache,
+-		       __driConfigOptions, __driNConfigOptions);
+-
+-   /* This is first since which regions we map depends on whether or
+-    * not we are using a PCI card.
+-    */
+-   screen->card_type = (dri_priv->IsPCI ? RADEON_CARD_PCI : RADEON_CARD_AGP);
+-   {
+-      int ret;
+-      ret = radeonGetParam( sPriv->fd, RADEON_PARAM_GART_BUFFER_OFFSET,
+-			    &screen->gart_buffer_offset);
+-
+-      if (ret) {
+-	 FREE( screen );
+-	 fprintf(stderr, "drm_radeon_getparam_t (RADEON_PARAM_GART_BUFFER_OFFSET): %d\n", ret);
+-	 return NULL;
+-      }
+-
+-      ret = radeonGetParam( sPriv->fd, RADEON_PARAM_GART_BASE,
+-			    &screen->gart_base);
+-      if (ret) {
+-	 FREE( screen );
+-	 fprintf(stderr, "drm_radeon_getparam_t (RADEON_PARAM_GART_BASE): %d\n", ret);
+-	 return NULL;
+-      }
+-
+-      ret = radeonGetParam( sPriv->fd, RADEON_PARAM_IRQ_NR,
+-			    &screen->irq);
+-      if (ret) {
+-	 FREE( screen );
+-	 fprintf(stderr, "drm_radeon_getparam_t (RADEON_PARAM_IRQ_NR): %d\n", ret);
+-	 return NULL;
+-      }
+-      screen->drmSupportsCubeMapsR200 = (sPriv->drm_version.minor >= 7);
+-      screen->drmSupportsBlendColor = (sPriv->drm_version.minor >= 11);
+-      screen->drmSupportsTriPerf = (sPriv->drm_version.minor >= 16);
+-      screen->drmSupportsFragShader = (sPriv->drm_version.minor >= 18);
+-      screen->drmSupportsPointSprites = (sPriv->drm_version.minor >= 13);
+-      screen->drmSupportsCubeMapsR100 = (sPriv->drm_version.minor >= 15);
+-      screen->drmSupportsVertexProgram = (sPriv->drm_version.minor >= 25);
+-   }
+-
+-   screen->mmio.handle = dri_priv->registerHandle;
+-   screen->mmio.size   = dri_priv->registerSize;
+-   if ( drmMap( sPriv->fd,
+-		screen->mmio.handle,
+-		screen->mmio.size,
+-		&screen->mmio.map ) ) {
+-      FREE( screen );
+-      __driUtilMessage("%s: drmMap failed\n", __FUNCTION__ );
+-      return NULL;
+-   }
+-
+-   RADEONMMIO = screen->mmio.map;
+-
+-   screen->status.handle = dri_priv->statusHandle;
+-   screen->status.size   = dri_priv->statusSize;
+-   if ( drmMap( sPriv->fd,
+-		screen->status.handle,
+-		screen->status.size,
+-		&screen->status.map ) ) {
+-      drmUnmap( screen->mmio.map, screen->mmio.size );
+-      FREE( screen );
+-      __driUtilMessage("%s: drmMap (2) failed\n", __FUNCTION__ );
+-      return NULL;
+-   }
+-   screen->scratch = (__volatile__ uint32_t *)
+-      ((GLubyte *)screen->status.map + RADEON_SCRATCH_REG_OFFSET);
+-
+-   screen->buffers = drmMapBufs( sPriv->fd );
+-   if ( !screen->buffers ) {
+-      drmUnmap( screen->status.map, screen->status.size );
+-      drmUnmap( screen->mmio.map, screen->mmio.size );
+-      FREE( screen );
+-      __driUtilMessage("%s: drmMapBufs failed\n", __FUNCTION__ );
+-      return NULL;
+-   }
+-
+-   if ( dri_priv->gartTexHandle && dri_priv->gartTexMapSize ) {
+-      screen->gartTextures.handle = dri_priv->gartTexHandle;
+-      screen->gartTextures.size   = dri_priv->gartTexMapSize;
+-      if ( drmMap( sPriv->fd,
+-		   screen->gartTextures.handle,
+-		   screen->gartTextures.size,
+-		   (drmAddressPtr)&screen->gartTextures.map ) ) {
+-	 drmUnmapBufs( screen->buffers );
+-	 drmUnmap( screen->status.map, screen->status.size );
+-	 drmUnmap( screen->mmio.map, screen->mmio.size );
+-	 FREE( screen );
+-	 __driUtilMessage("%s: drmMap failed for GART texture area\n", __FUNCTION__);
+-	 return NULL;
+-      }
+-
+-      screen->gart_texture_offset = dri_priv->gartTexOffset + screen->gart_base;
+-   }
+-
++static int radeon_set_screen_flags(radeonScreenPtr screen, int device_id)
++{
+    screen->chip_flags = 0;
+-   /* XXX: add more chipsets */
+-   switch ( dri_priv->deviceID ) {
++   switch ( device_id ) {
+    case PCI_CHIP_RADEON_LY:
+    case PCI_CHIP_RADEON_LZ:
+    case PCI_CHIP_RADEON_QY:
+@@ -683,12 +570,6 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
+       screen->chip_family = CHIP_FAMILY_RS400;
+       break;
+ 
+-   case PCI_CHIP_RS600_793F:
+-   case PCI_CHIP_RS600_7941:
+-   case PCI_CHIP_RS600_7942:
+-      screen->chip_family = CHIP_FAMILY_RS600;
+-      break;
+-
+    case PCI_CHIP_RS690_791E:
+    case PCI_CHIP_RS690_791F:
+       screen->chip_family = CHIP_FAMILY_RS690;
+@@ -817,9 +698,162 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
+ 
+    default:
+       fprintf(stderr, "unknown chip id 0x%x, can't guess.\n",
+-	      dri_priv->deviceID);
++	      device_id);
++      return -1;
++   }
++
++   return 0;
++}
++
++
++/* Create the device specific screen private data struct.
++ */
++static radeonScreenPtr
++radeonCreateScreen( __DRIscreenPrivate *sPriv )
++{
++   radeonScreenPtr screen;
++   RADEONDRIPtr dri_priv = (RADEONDRIPtr)sPriv->pDevPriv;
++   unsigned char *RADEONMMIO = NULL;
++   int i;
++   int ret;
++   uint32_t temp;
++
++   if (sPriv->devPrivSize != sizeof(RADEONDRIRec)) {
++      fprintf(stderr,"\nERROR!  sizeof(RADEONDRIRec) does not match passed size from device driver\n");
++      return GL_FALSE;
++   }
++
++   /* Allocate the private area */
++   screen = (radeonScreenPtr) CALLOC( sizeof(*screen) );
++   if ( !screen ) {
++      __driUtilMessage("%s: Could not allocate memory for screen structure",
++		       __FUNCTION__);
+       return NULL;
+    }
++
++#if DO_DEBUG && RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
++	RADEON_DEBUG = driParseDebugString(getenv("RADEON_DEBUG"), debug_control);
++#endif
++
++   /* parse information in __driConfigOptions */
++   driParseOptionInfo (&screen->optionCache,
++		       __driConfigOptions, __driNConfigOptions);
++
++   /* This is first since which regions we map depends on whether or
++    * not we are using a PCI card.
++    */
++   screen->card_type = (dri_priv->IsPCI ? RADEON_CARD_PCI : RADEON_CARD_AGP);
++   {
++      int ret;
++
++#ifdef RADEON_PARAM_KERNEL_MM
++     ret = radeonGetParam( sPriv->fd, RADEON_PARAM_KERNEL_MM,
++                            &screen->kernel_mm);
++
++      if (ret && ret != -EINVAL) {
++         FREE( screen );
++         fprintf(stderr, "drm_radeon_getparam_t (RADEON_OFFSET): %d\n", ret);
++         return NULL;
++      }
++
++      if (ret == -EINVAL)
++          screen->kernel_mm = 0;
++#endif
 +
-+#include <errno.h>
-+#include <unistd.h>
++      ret = radeonGetParam( sPriv->fd, RADEON_PARAM_GART_BUFFER_OFFSET,
++			    &screen->gart_buffer_offset);
 +
-+#include "main/simple_list.h"
-+#include "main/texcompress.h"
-+#include "main/texformat.h"
++      if (ret) {
++	 FREE( screen );
++	 fprintf(stderr, "drm_radeon_getparam_t (RADEON_PARAM_GART_BUFFER_OFFSET): %d\n", ret);
++	 return NULL;
++      }
 +
-+static GLuint radeon_compressed_texture_size(GLcontext *ctx,
-+		GLsizei width, GLsizei height, GLsizei depth,
-+		GLuint mesaFormat)
-+{
-+	GLuint size = _mesa_compressed_texture_size(ctx, width, height, depth, mesaFormat);
++      ret = radeonGetParam( sPriv->fd, RADEON_PARAM_GART_BASE,
++			    &screen->gart_base);
++      if (ret) {
++	 FREE( screen );
++	 fprintf(stderr, "drm_radeon_getparam_t (RADEON_PARAM_GART_BASE): %d\n", ret);
++	 return NULL;
++      }
 +
-+	if (mesaFormat == MESA_FORMAT_RGB_DXT1 ||
-+	    mesaFormat == MESA_FORMAT_RGBA_DXT1) {
-+		if (width + 3 < 8)	/* width one block */
-+			size = size * 4;
-+		else if (width + 3 < 16)
-+			size = size * 2;
-+	} else {
-+		/* DXT3/5, 16 bytes per block */
-+	  //		WARN_ONCE("DXT 3/5 suffers from multitexturing problems!\n");
-+		if (width + 3 < 8)
-+			size = size * 2;
-+	}
++      ret = radeonGetParam( sPriv->fd, RADEON_PARAM_IRQ_NR,
++			    &screen->irq);
++      if (ret) {
++	 FREE( screen );
++	 fprintf(stderr, "drm_radeon_getparam_t (RADEON_PARAM_IRQ_NR): %d\n", ret);
++	 return NULL;
++      }
++      screen->drmSupportsCubeMapsR200 = (sPriv->drm_version.minor >= 7);
++      screen->drmSupportsBlendColor = (sPriv->drm_version.minor >= 11);
++      screen->drmSupportsTriPerf = (sPriv->drm_version.minor >= 16);
++      screen->drmSupportsFragShader = (sPriv->drm_version.minor >= 18);
++      screen->drmSupportsPointSprites = (sPriv->drm_version.minor >= 13);
++      screen->drmSupportsCubeMapsR100 = (sPriv->drm_version.minor >= 15);
++      screen->drmSupportsVertexProgram = (sPriv->drm_version.minor >= 25);
++   }
 +
-+	return size;
-+}
++   if (!screen->kernel_mm) {
++     screen->mmio.handle = dri_priv->registerHandle;
++     screen->mmio.size   = dri_priv->registerSize;
++     if ( drmMap( sPriv->fd,
++		  screen->mmio.handle,
++		  screen->mmio.size,
++		  &screen->mmio.map ) ) {
++       FREE( screen );
++       __driUtilMessage("%s: drmMap failed\n", __FUNCTION__ );
++       return NULL;
++     }
++
++     RADEONMMIO = screen->mmio.map;
++
++     screen->status.handle = dri_priv->statusHandle;
++     screen->status.size   = dri_priv->statusSize;
++     if ( drmMap( sPriv->fd,
++		  screen->status.handle,
++		  screen->status.size,
++		  &screen->status.map ) ) {
++       drmUnmap( screen->mmio.map, screen->mmio.size );
++       FREE( screen );
++       __driUtilMessage("%s: drmMap (2) failed\n", __FUNCTION__ );
++       return NULL;
++     }
++     screen->scratch = (__volatile__ uint32_t *)
++       ((GLubyte *)screen->status.map + RADEON_SCRATCH_REG_OFFSET);
++
++     screen->buffers = drmMapBufs( sPriv->fd );
++     if ( !screen->buffers ) {
++       drmUnmap( screen->status.map, screen->status.size );
++       drmUnmap( screen->mmio.map, screen->mmio.size );
++       FREE( screen );
++       __driUtilMessage("%s: drmMapBufs failed\n", __FUNCTION__ );
++       return NULL;
++     }
++     
++     if ( dri_priv->gartTexHandle && dri_priv->gartTexMapSize ) {
++       screen->gartTextures.handle = dri_priv->gartTexHandle;
++       screen->gartTextures.size   = dri_priv->gartTexMapSize;
++       if ( drmMap( sPriv->fd,
++		    screen->gartTextures.handle,
++		    screen->gartTextures.size,
++		    (drmAddressPtr)&screen->gartTextures.map ) ) {
++	 drmUnmapBufs( screen->buffers );
++	 drmUnmap( screen->status.map, screen->status.size );
++	 drmUnmap( screen->mmio.map, screen->mmio.size );
++	 FREE( screen );
++	 __driUtilMessage("%s: drmMap failed for GART texture area\n", __FUNCTION__);
++	 return NULL;
++       }
++       
++       screen->gart_texture_offset = dri_priv->gartTexOffset + screen->gart_base;
++     }
++   }
 +
 +
-+static int radeon_compressed_num_bytes(GLuint mesaFormat)
-+{
-+   int bytes = 0;
-+   switch(mesaFormat) {
-+     
-+   case MESA_FORMAT_RGB_FXT1:
-+   case MESA_FORMAT_RGBA_FXT1:
-+   case MESA_FORMAT_RGB_DXT1:
-+   case MESA_FORMAT_RGBA_DXT1:
-+     bytes = 2;
-+     break;
-+     
-+   case MESA_FORMAT_RGBA_DXT3:
-+   case MESA_FORMAT_RGBA_DXT5:
-+     bytes = 4;
-+   default:
-+     break;
++   ret = radeon_set_screen_flags(screen, dri_priv->deviceID);
++   if (ret == -1)
++     return NULL;
++
+    if ((screen->chip_family == CHIP_FAMILY_R350 || screen->chip_family == CHIP_FAMILY_R300) &&
+        sPriv->ddx_version.minor < 2) {
+       fprintf(stderr, "xf86-video-ati-6.6.2 or newer needed for Radeon 9500/9700/9800 cards.\n");
+@@ -847,7 +881,7 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
+    ret = radeonGetParam( sPriv->fd, RADEON_PARAM_FB_LOCATION,
+                          &temp);
+    if (ret) {
+-       if (screen->chip_family < CHIP_FAMILY_RS600)
++       if (screen->chip_family < CHIP_FAMILY_RS690 && !screen->kernel_mm)
+ 	   screen->fbLocation      = ( INREG( RADEON_MC_FB_LOCATION ) & 0xffff) << 16;
+        else {
+            FREE( screen );
+@@ -858,7 +892,7 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
+        screen->fbLocation = (temp & 0xffff) << 16;
+    }
+ 
+-   if (screen->chip_family >= CHIP_FAMILY_R300) {
++   if (screen->chip_family >= CHIP_FAMILY_RV515) {
+        ret = radeonGetParam( sPriv->fd, RADEON_PARAM_NUM_GB_PIPES,
+ 			     &temp);
+        if (ret) {
+@@ -949,6 +983,103 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
+        screen->extensions[i++] = &driMediaStreamCounterExtension.base;
+    }
+ 
++   if (!screen->kernel_mm) {
++#if !RADEON_COMMON
++   	screen->extensions[i++] = &radeonTexOffsetExtension.base;
++#endif
++
++#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
++        if (IS_R200_CLASS(screen))
++            screen->extensions[i++] = &r200AllocateExtension.base;
++
++        screen->extensions[i++] = &r200texOffsetExtension.base;
++#endif
++
++#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
++        screen->extensions[i++] = &r300texOffsetExtension.base;
++#endif
 +   }
-+   
-+   return bytes;
++
++   screen->extensions[i++] = NULL;
++   sPriv->extensions = screen->extensions;
++
++   screen->driScreen = sPriv;
++   screen->sarea_priv_offset = dri_priv->sarea_priv_offset;
++   screen->sarea = (drm_radeon_sarea_t *) ((GLubyte *) sPriv->pSAREA +
++					       screen->sarea_priv_offset);
++
++   if (screen->kernel_mm)
++     screen->bom = radeon_bo_manager_gem_ctor(sPriv->fd);
++   else
++     screen->bom = radeon_bo_manager_legacy_ctor(screen);
++   if (screen->bom == NULL) {
++     free(screen);
++     return NULL;
++   }
++
++   return screen;
 +}
 +
-+/**
-+ * Compute sizes and fill in offset and blit information for the given
-+ * image (determined by \p face and \p level).
-+ *
-+ * \param curOffset points to the offset at which the image is to be stored
-+ * and is updated by this function according to the size of the image.
-+ */
-+static void compute_tex_image_offset(radeon_mipmap_tree *mt,
-+	GLuint face, GLuint level, GLuint* curOffset)
++static radeonScreenPtr
++radeonCreateScreen2(__DRIscreenPrivate *sPriv)
 +{
-+	radeon_mipmap_level *lvl = &mt->levels[level];
++   radeonScreenPtr screen;
++   int i;
++   int ret;
++   uint32_t device_id;
++
++   /* Allocate the private area */
++   screen = (radeonScreenPtr) CALLOC( sizeof(*screen) );
++   if ( !screen ) {
++      __driUtilMessage("%s: Could not allocate memory for screen structure",
++		       __FUNCTION__);
++      fprintf(stderr, "leaving here\n");
++      return NULL;
++   }
 +
-+	/* Find image size in bytes */
-+	if (mt->compressed) {
-+		/* TODO: Is this correct? Need test cases for compressed textures! */
-+		GLuint align;
++#if DO_DEBUG && RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
++	RADEON_DEBUG = driParseDebugString(getenv("RADEON_DEBUG"), debug_control);
++#endif
 +
-+		lvl->rowstride = (lvl->width * mt->bpp + 63) & ~63;
-+		lvl->size = radeon_compressed_texture_size(mt->radeon->glCtx,
-+							   lvl->width, lvl->height, lvl->depth, mt->compressed);
-+	} else if (mt->target == GL_TEXTURE_RECTANGLE_NV) {
-+		lvl->rowstride = (lvl->width * mt->bpp + 63) & ~63;
-+		lvl->size = lvl->rowstride * lvl->height;
-+	} else if (mt->tilebits & RADEON_TXO_MICRO_TILE) {
-+		/* tile pattern is 16 bytes x2. mipmaps stay 32 byte aligned,
-+		 * though the actual offset may be different (if texture is less than
-+		 * 32 bytes width) to the untiled case */
-+		lvl->rowstride = (lvl->width * mt->bpp * 2 + 31) & ~31;
-+		lvl->size = lvl->rowstride * ((lvl->height + 1) / 2) * lvl->depth;
-+	} else {
-+		lvl->rowstride = (lvl->width * mt->bpp + 31) & ~31;
-+		lvl->size = lvl->rowstride * lvl->height * lvl->depth;
-+	}
-+	assert(lvl->size > 0);
++   /* parse information in __driConfigOptions */
++   driParseOptionInfo (&screen->optionCache,
++		       __driConfigOptions, __driNConfigOptions);
 +
-+	/* All images are aligned to a 32-byte offset */
-+	*curOffset = (*curOffset + 0x1f) & ~0x1f;
-+	lvl->faces[face].offset = *curOffset;
-+	*curOffset += lvl->size;
++   screen->kernel_mm = 1;
++   screen->chip_flags = 0;
 +
-+	if (RADEON_DEBUG & DEBUG_TEXTURE)
-+	  fprintf(stderr,
-+		  "level %d, face %d: rs:%d %dx%d at %d\n",
-+		  level, face, lvl->rowstride, lvl->width, lvl->height, lvl->faces[face].offset);
-+}
++   ret = radeonGetParam( sPriv->fd, RADEON_PARAM_IRQ_NR,
++			 &screen->irq);
 +
-+static GLuint minify(GLuint size, GLuint levels)
++   ret = radeonGetParam( sPriv->fd, RADEON_PARAM_DEVICE_ID,
++			 &device_id);
++   if (ret) {
++     FREE( screen );
++     fprintf(stderr, "drm_radeon_getparam_t (RADEON_PARAM_DEVICE_ID): %d\n", ret);
++     return NULL;
++   }
++
++   ret = radeon_set_screen_flags(screen, device_id);
++   if (ret == -1)
++     return NULL;
++
++   if (screen->chip_family <= CHIP_FAMILY_RS200)
++      screen->chip_flags |= RADEON_CLASS_R100;
++   else if (screen->chip_family <= CHIP_FAMILY_RV280)
++      screen->chip_flags |= RADEON_CLASS_R200;
++   else
++      screen->chip_flags |= RADEON_CLASS_R300;
++
++   i = 0;
++   screen->extensions[i++] = &driCopySubBufferExtension.base;
++   screen->extensions[i++] = &driFrameTrackingExtension.base;
++   screen->extensions[i++] = &driReadDrawableExtension;
++
++   if ( screen->irq != 0 ) {
++       screen->extensions[i++] = &driSwapControlExtension.base;
++       screen->extensions[i++] = &driMediaStreamCounterExtension.base;
++   }
++
+ #if !RADEON_COMMON
+    screen->extensions[i++] = &radeonTexOffsetExtension.base;
+ #endif
+@@ -961,14 +1092,19 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
+ #endif
+ 
+ #if RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
+-   screen->extensions[i++] = &r300texOffsetExtension.base;
++   //screen->extensions[i++] = &r300texOffsetExtension.base;
++   screen->extensions[i++] = &r300TexBufferExtension.base;
+ #endif
+ 
+    screen->extensions[i++] = NULL;
+    sPriv->extensions = screen->extensions;
+ 
+    screen->driScreen = sPriv;
+-   screen->sarea_priv_offset = dri_priv->sarea_priv_offset;
++   screen->bom = radeon_bo_manager_gem_ctor(sPriv->fd);
++   if (screen->bom == NULL) {
++       free(screen);
++       return NULL;
++   }
+    return screen;
+ }
+ 
+@@ -977,23 +1113,32 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
+ static void
+ radeonDestroyScreen( __DRIscreenPrivate *sPriv )
+ {
+-   radeonScreenPtr screen = (radeonScreenPtr)sPriv->private;
++    radeonScreenPtr screen = (radeonScreenPtr)sPriv->private;
+ 
+-   if (!screen)
+-      return;
++    if (!screen) /* no screen was created: nothing to tear down */
++        return;
+ 
+-   if ( screen->gartTextures.map ) {
+-      drmUnmap( screen->gartTextures.map, screen->gartTextures.size );
+-   }
+-   drmUnmapBufs( screen->buffers );
+-   drmUnmap( screen->status.map, screen->status.size );
+-   drmUnmap( screen->mmio.map, screen->mmio.size );
++    if (screen->kernel_mm) { /* kernel_mm set: only the GEM buffer manager is destroyed */
++#ifdef RADEON_BO_TRACK
++        radeon_tracker_print(&screen->bom->tracker, stderr);
++#endif
++        radeon_bo_manager_gem_dtor(screen->bom);
++    } else { /* otherwise: destroy the legacy buffer manager, then unmap the DRM regions */
++        radeon_bo_manager_legacy_dtor(screen->bom);
++
++        if ( screen->gartTextures.map ) {
++            drmUnmap( screen->gartTextures.map, screen->gartTextures.size );
++        }
++        drmUnmapBufs( screen->buffers );
++        drmUnmap( screen->status.map, screen->status.size );
++        drmUnmap( screen->mmio.map, screen->mmio.size );
++    }
+ 
+-   /* free all option information */
+-   driDestroyOptionInfo (&screen->optionCache);
++    /* free all option information */
++    driDestroyOptionInfo (&screen->optionCache);
+ 
+-   FREE( screen );
+-   sPriv->private = NULL;
++    FREE( screen );
++    sPriv->private = NULL; /* do not leave a pointer to the freed screen behind */
+ }
+ 
+ 
+@@ -1002,15 +1147,102 @@ radeonDestroyScreen( __DRIscreenPrivate *sPriv )
+ static GLboolean
+ radeonInitDriver( __DRIscreenPrivate *sPriv )
+ {
+-   sPriv->private = (void *) radeonCreateScreen( sPriv );
+-   if ( !sPriv->private ) {
+-      radeonDestroyScreen( sPriv );
+-      return GL_FALSE;
+-   }
++    if (sPriv->dri2.enabled) { /* pick the screen constructor matching the DRI version in use */
++        sPriv->private = (void *) radeonCreateScreen2( sPriv );
++    } else {
++        sPriv->private = (void *) radeonCreateScreen( sPriv );
++    }
++    if ( !sPriv->private ) { /* screen creation failed */
++        radeonDestroyScreen( sPriv ); /* returns at once here: it bails out on a NULL private pointer */
++        return GL_FALSE;
++    }
+ 
+-   return GL_TRUE;
++    return GL_TRUE;
+ }
+ 
++static GLboolean /* always returns GL_TRUE */
++radeon_alloc_window_storage(GLcontext *ctx, struct gl_renderbuffer *rb,
++			    GLenum intFormat, GLuint w, GLuint h)
 +{
-+	size = size >> levels;
-+	if (size < 1)
-+		size = 1;
-+	return size;
++    rb->Width = w; /* only the size and format are recorded; nothing is allocated in this function */
++    rb->Height = h;
++    rb->_ActualFormat = intFormat; /* ctx is unused */
++
++    return GL_TRUE;
 +}
 +
-+static void calculate_miptree_layout(radeon_mipmap_tree *mt)
++
++static struct radeon_renderbuffer * /* returns NULL on allocation failure or unknown format */
++radeon_create_renderbuffer(GLenum format, __DRIdrawablePrivate *driDrawPriv)
 +{
-+	GLuint curOffset;
-+	GLuint numLevels;
-+	GLuint i;
++    struct radeon_renderbuffer *ret;
 +
-+	numLevels = mt->lastLevel - mt->firstLevel + 1;
-+	assert(numLevels <= RADEON_MAX_TEXTURE_LEVELS);
++    ret = CALLOC_STRUCT(radeon_renderbuffer);
++    if (!ret)
++	return NULL;
 +
-+	curOffset = 0;
-+	for(i = 0; i < numLevels; i++) {
-+		GLuint face;
++    _mesa_init_renderbuffer(&ret->base, 0);
 +
-+		mt->levels[i].width = minify(mt->width0, i);
-+		mt->levels[i].height = minify(mt->height0, i);
-+		mt->levels[i].depth = minify(mt->depth0, i);
++    /* XXX format junk: set actual/base format, per-channel bit counts and data type per requested format */
++    switch (format) {
++	case GL_RGB5:
++	    ret->base._ActualFormat = GL_RGB5;
++	    ret->base._BaseFormat = GL_RGBA;
++	    ret->base.RedBits = 5;
++	    ret->base.GreenBits = 6; /* 5/6/5 bit layout despite the GL_RGB5 name */
++	    ret->base.BlueBits = 5;
++	    ret->base.DataType = GL_UNSIGNED_BYTE;
++	    break;
++	case GL_RGBA8:
++	    ret->base._ActualFormat = GL_RGBA8;
++	    ret->base._BaseFormat = GL_RGBA;
++	    ret->base.RedBits = 8;
++	    ret->base.GreenBits = 8;
++	    ret->base.BlueBits = 8;
++	    ret->base.AlphaBits = 8;
++	    ret->base.DataType = GL_UNSIGNED_BYTE;
++	    break;
++	case GL_STENCIL_INDEX8_EXT:
++	    ret->base._ActualFormat = GL_STENCIL_INDEX8_EXT;
++	    ret->base._BaseFormat = GL_STENCIL_INDEX;
++	    ret->base.StencilBits = 8;
++	    ret->base.DataType = GL_UNSIGNED_BYTE;
++	    break;
++	case GL_DEPTH_COMPONENT16:
++	    ret->base._ActualFormat = GL_DEPTH_COMPONENT16;
++	    ret->base._BaseFormat = GL_DEPTH_COMPONENT;
++	    ret->base.DepthBits = 16;
++	    ret->base.DataType = GL_UNSIGNED_SHORT;
++	    break;
++	case GL_DEPTH_COMPONENT24:
++	    ret->base._ActualFormat = GL_DEPTH24_STENCIL8_EXT; /* 24-bit depth is recorded with the packed depth/stencil actual format */
++	    ret->base._BaseFormat = GL_DEPTH_COMPONENT;
++	    ret->base.DepthBits = 24;
++	    ret->base.DataType = GL_UNSIGNED_INT;
++	    break;
++	case GL_DEPTH24_STENCIL8_EXT:
++	    ret->base._ActualFormat = GL_DEPTH24_STENCIL8_EXT;
++	    ret->base._BaseFormat = GL_DEPTH_STENCIL_EXT;
++	    ret->base.DepthBits = 24;
++	    ret->base.StencilBits = 8;
++	    ret->base.DataType = GL_UNSIGNED_INT_24_8_EXT;
++	    break;
++	default:
++	    fprintf(stderr, "%s: Unknown format 0x%04x\n", __FUNCTION__, format);
++	    _mesa_delete_renderbuffer(&ret->base); /* release the partially initialised renderbuffer */
++	    return NULL;
++    }
 +
-+		for(face = 0; face < mt->faces; face++)
-+			compute_tex_image_offset(mt, face, i, &curOffset);
-+	}
++    ret->dPriv = driDrawPriv;
++    ret->base.InternalFormat = format;
 +
-+	/* Note the required size in memory */
-+	mt->totalsize = (curOffset + RADEON_OFFSET_MASK) & ~RADEON_OFFSET_MASK;
-+}
++    ret->base.AllocStorage = radeon_alloc_window_storage;
 +
++    radeonSetSpanFunctions(ret);
 +
++    ret->bo = NULL; /* no buffer object is attached at creation time */
++    return ret;
++}
+ 
+ /**
+  * Create the Mesa framebuffer and renderbuffers for a given window/drawable.
+@@ -1026,95 +1258,86 @@ radeonCreateBuffer( __DRIscreenPrivate *driScrnPriv,
+ {
+    radeonScreenPtr screen = (radeonScreenPtr) driScrnPriv->private;
+ 
+-   if (isPixmap) {
+-      return GL_FALSE; /* not implemented */
+-   }
+-   else {
+-      const GLboolean swDepth = GL_FALSE;
+-      const GLboolean swAlpha = GL_FALSE;
+-      const GLboolean swAccum = mesaVis->accumRedBits > 0;
+-      const GLboolean swStencil = mesaVis->stencilBits > 0 &&
+-         mesaVis->depthBits != 24;
+-      struct gl_framebuffer *fb = _mesa_create_framebuffer(mesaVis);
+-
+-      /* front color renderbuffer */
+-      {
+-         driRenderbuffer *frontRb
+-            = driNewRenderbuffer(GL_RGBA,
+-                                 driScrnPriv->pFB + screen->frontOffset,
+-                                 screen->cpp,
+-                                 screen->frontOffset, screen->frontPitch,
+-                                 driDrawPriv);
+-         radeonSetSpanFunctions(frontRb, mesaVis);
+-         _mesa_add_renderbuffer(fb, BUFFER_FRONT_LEFT, &frontRb->Base);
+-      }
++    const GLboolean swDepth = GL_FALSE;
++    const GLboolean swAlpha = GL_FALSE;
++    const GLboolean swAccum = mesaVis->accumRedBits > 0;
++    const GLboolean swStencil = mesaVis->stencilBits > 0 &&
++	mesaVis->depthBits != 24;
++    GLenum rgbFormat = (mesaVis->redBits == 5 ? GL_RGB5 : GL_RGBA8);
++    GLenum depthFormat = GL_NONE;
++    struct gl_framebuffer *fb = _mesa_create_framebuffer(mesaVis);
++
++    if (mesaVis->depthBits == 16)
++	depthFormat = GL_DEPTH_COMPONENT16;
++    else if (mesaVis->depthBits == 24)
++	depthFormat = GL_DEPTH_COMPONENT24;
++
++    /* front color renderbuffer */
++    {
++	struct radeon_renderbuffer *front =
++	    radeon_create_renderbuffer(rgbFormat, driDrawPriv);
++	_mesa_add_renderbuffer(fb, BUFFER_FRONT_LEFT, &front->base);
++	front->has_surface = 1;
++    }
+ 
+-      /* back color renderbuffer */
+-      if (mesaVis->doubleBufferMode) {
+-         driRenderbuffer *backRb
+-            = driNewRenderbuffer(GL_RGBA,
+-                                 driScrnPriv->pFB + screen->backOffset,
+-                                 screen->cpp,
+-                                 screen->backOffset, screen->backPitch,
+-                                 driDrawPriv);
+-         radeonSetSpanFunctions(backRb, mesaVis);
+-         _mesa_add_renderbuffer(fb, BUFFER_BACK_LEFT, &backRb->Base);
+-      }
++    /* back color renderbuffer */
++    if (mesaVis->doubleBufferMode) {
++	struct radeon_renderbuffer *back =
++	    radeon_create_renderbuffer(rgbFormat, driDrawPriv);
++	_mesa_add_renderbuffer(fb, BUFFER_BACK_LEFT, &back->base);
++	back->has_surface = 1;
++    }
+ 
+-      /* depth renderbuffer */
+-      if (mesaVis->depthBits == 16) {
+-         driRenderbuffer *depthRb
+-            = driNewRenderbuffer(GL_DEPTH_COMPONENT16,
+-                                 driScrnPriv->pFB + screen->depthOffset,
+-                                 screen->cpp,
+-                                 screen->depthOffset, screen->depthPitch,
+-                                 driDrawPriv);
+-         radeonSetSpanFunctions(depthRb, mesaVis);
+-         _mesa_add_renderbuffer(fb, BUFFER_DEPTH, &depthRb->Base);
+-	 depthRb->depthHasSurface = screen->depthHasSurface;
+-      }
+-      else if (mesaVis->depthBits == 24) {
+-         driRenderbuffer *depthRb
+-            = driNewRenderbuffer(GL_DEPTH_COMPONENT24,
+-                                 driScrnPriv->pFB + screen->depthOffset,
+-                                 screen->cpp,
+-                                 screen->depthOffset, screen->depthPitch,
+-                                 driDrawPriv);
+-         radeonSetSpanFunctions(depthRb, mesaVis);
+-         _mesa_add_renderbuffer(fb, BUFFER_DEPTH, &depthRb->Base);
+-	 depthRb->depthHasSurface = screen->depthHasSurface;
+-      }
++    /* depth renderbuffer */
++    if (depthFormat != GL_NONE) {
++	struct radeon_renderbuffer *depth =
++	    radeon_create_renderbuffer(depthFormat, driDrawPriv);
++	_mesa_add_renderbuffer(fb, BUFFER_DEPTH, &depth->base);
++	depth->has_surface = screen->depthHasSurface;
++    }
+ 
+-      /* stencil renderbuffer */
+-      if (mesaVis->stencilBits > 0 && !swStencil) {
+-         driRenderbuffer *stencilRb
+-            = driNewRenderbuffer(GL_STENCIL_INDEX8_EXT,
+-                                 driScrnPriv->pFB + screen->depthOffset,
+-                                 screen->cpp,
+-                                 screen->depthOffset, screen->depthPitch,
+-                                 driDrawPriv);
+-         radeonSetSpanFunctions(stencilRb, mesaVis);
+-         _mesa_add_renderbuffer(fb, BUFFER_STENCIL, &stencilRb->Base);
+-	 stencilRb->depthHasSurface = screen->depthHasSurface;
+-      }
++    /* stencil renderbuffer */
++    if (mesaVis->stencilBits > 0 && !swStencil) {
++	struct radeon_renderbuffer *stencil =
++	    radeon_create_renderbuffer(GL_STENCIL_INDEX8_EXT, driDrawPriv);
++	_mesa_add_renderbuffer(fb, BUFFER_STENCIL, &stencil->base);
++	stencil->has_surface = screen->depthHasSurface;
++    }
+ 
+-      _mesa_add_soft_renderbuffers(fb,
+-                                   GL_FALSE, /* color */
+-                                   swDepth,
+-                                   swStencil,
+-                                   swAccum,
+-                                   swAlpha,
+-                                   GL_FALSE /* aux */);
+-      driDrawPriv->driverPrivate = (void *) fb;
++    _mesa_add_soft_renderbuffers(fb,
++	    GL_FALSE, /* color */
++	    swDepth,
++	    swStencil,
++	    swAccum,
++	    swAlpha,
++	    GL_FALSE /* aux */);
++    driDrawPriv->driverPrivate = (void *) fb;
+ 
+-      return (driDrawPriv->driverPrivate != NULL);
+-   }
++    return (driDrawPriv->driverPrivate != NULL);
+ }
+ 
+-
+ static void
+ radeonDestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
+ {
++	struct radeon_renderbuffer *rb;
++	GLframebuffer *fb;
++    /* Drop the buffer objects of the front, back and depth renderbuffers, then unreference the framebuffer. */
++    fb = (void*)driDrawPriv->driverPrivate; /* NOTE(review): fb is dereferenced below without a NULL check */
++    rb = (void *)fb->Attachment[BUFFER_FRONT_LEFT].Renderbuffer;
++    if (rb && rb->bo) {
++        radeon_bo_unref(rb->bo);
++        rb->bo = NULL;
++    }
++    rb = (void *)fb->Attachment[BUFFER_BACK_LEFT].Renderbuffer;
++    if (rb && rb->bo) {
++        radeon_bo_unref(rb->bo);
++        rb->bo = NULL;
++    }
++    rb = (void *)fb->Attachment[BUFFER_DEPTH].Renderbuffer;
++    if (rb && rb->bo) {
++        radeon_bo_unref(rb->bo);
++        rb->bo = NULL;
++    } /* NOTE(review): BUFFER_STENCIL's bo is not released here - confirm whether that is intended */
+    _mesa_unreference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)));
+ }
+ 
+@@ -1149,6 +1372,7 @@ static void radeonDestroyContext(__DRIcontextPrivate * driContextPriv)
+ 
+ #endif
+ 
++
+ /**
+  * This is the driver specific part of the createNewScreen entry point.
+  *
+@@ -1201,18 +1425,109 @@ radeonInitScreen(__DRIscreenPrivate *psp)
+    driInitSingleExtension( NULL, NV_vp_extension );
+    driInitSingleExtension( NULL, ATI_fs_extension );
+    driInitExtensions( NULL, point_extensions, GL_FALSE );
++#elif defined(RADEON_COMMON_FOR_R300)
++   driInitSingleExtension( NULL, gl_20_extension );
+ #endif
+ 
+    if (!radeonInitDriver(psp))
+        return NULL;
+ 
++   /* for now fill in all modes */
+    return radeonFillInModes( psp,
+ 			     dri_priv->bpp,
+ 			     (dri_priv->bpp == 16) ? 16 : 24,
+-			     (dri_priv->bpp == 16) ? 0  : 8,
+-			     (dri_priv->backOffset != dri_priv->depthOffset) );
++			     (dri_priv->bpp == 16) ? 0  : 8, 1);
+ }
++#define ARRAY_SIZE(a) (sizeof (a) / sizeof ((a)[0]))
+ 
 +/**
-+ * Create a new mipmap tree, calculate its layout and allocate memory.
++ * This is the driver specific part of the createNewScreen entry point.
++ * Called when using DRI2.
++ *
++ * \return the __DRIconfig list supported by this driver, or NULL on failure
 + */
-+radeon_mipmap_tree* radeon_miptree_create(radeonContextPtr rmesa, radeonTexObj *t,
-+		GLenum target, GLuint firstLevel, GLuint lastLevel,
-+		GLuint width0, GLuint height0, GLuint depth0,
-+		GLuint bpp, GLuint tilebits, GLuint compressed)
++static const
++__DRIconfig **radeonInitScreen2(__DRIscreenPrivate *psp)
 +{
-+	radeon_mipmap_tree *mt = CALLOC_STRUCT(_radeon_mipmap_tree);
-+
-+	mt->radeon = rmesa;
-+	mt->refcount = 1;
-+	mt->t = t;
-+	mt->target = target;
-+	mt->faces = (target == GL_TEXTURE_CUBE_MAP) ? 6 : 1;
-+	mt->firstLevel = firstLevel;
-+	mt->lastLevel = lastLevel;
-+	mt->width0 = width0;
-+	mt->height0 = height0;
-+	mt->depth0 = depth0;
-+	mt->bpp = compressed ? radeon_compressed_num_bytes(compressed) : bpp;
-+	mt->tilebits = tilebits;
-+	mt->compressed = compressed;
++   GLenum fb_format[3];
++   GLenum fb_type[3];
++   /* NOTE(review): the Intel-driver remark that stood here looks copied over;
++    * in this list GLX_SWAP_COPY_OML is commented out, i.e. not advertised.
++    */
++   static const GLenum back_buffer_modes[] = {
++     GLX_NONE, GLX_SWAP_UNDEFINED_OML, /*, GLX_SWAP_COPY_OML*/
++   };
++   uint8_t depth_bits[4], stencil_bits[4], msaa_samples_array[1];
++   int color;
++   __DRIconfig **configs = NULL;
++
++   /* Calling driInitExtensions here, with a NULL context pointer,
++    * does not actually enable the extensions.  It just makes sure
++    * that all the dispatch offsets for all the extensions that
++    * *might* be enabled are known.  This is needed because the
++    * dispatch offsets need to be known when _mesa_context_create
++    * is called, but we can't enable the extensions until we have a
++    * context pointer.
++    *
++    * Hello chicken.  Hello egg.  How are you two today?
++    */
++   driInitExtensions( NULL, card_extensions, GL_FALSE );
++#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
++   driInitExtensions( NULL, blend_extensions, GL_FALSE );
++   driInitSingleExtension( NULL, ARB_vp_extension );
++   driInitSingleExtension( NULL, NV_vp_extension );
++   driInitSingleExtension( NULL, ATI_fs_extension );
++   driInitExtensions( NULL, point_extensions, GL_FALSE );
++#endif
 +
-+	calculate_miptree_layout(mt);
++   if (!radeonInitDriver(psp)) { /* screen creation failed: no configs to report */
++       return NULL;
++    }
++   depth_bits[0] = 0; /* depth/stencil pairs offered: 0/0, 16/0, 24/0, 24/8 */
++   stencil_bits[0] = 0;
++   depth_bits[1] = 16;
++   stencil_bits[1] = 0;
++   depth_bits[2] = 24;
++   stencil_bits[2] = 0;
++   depth_bits[3] = 24;
++   stencil_bits[3] = 8;
++
++   msaa_samples_array[0] = 0; /* single entry: 0 samples */
++
++   fb_format[0] = GL_RGB; /* colour layouts tried in turn by the loop below */
++   fb_type[0] = GL_UNSIGNED_SHORT_5_6_5;
++
++   fb_format[1] = GL_BGR;
++   fb_type[1] = GL_UNSIGNED_INT_8_8_8_8_REV;
++
++   fb_format[2] = GL_BGRA;
++   fb_type[2] = GL_UNSIGNED_INT_8_8_8_8_REV;
++
++   for (color = 0; color < ARRAY_SIZE(fb_format); color++) { /* one driCreateConfigs call per colour layout */
++      __DRIconfig **new_configs;
++
++      new_configs = driCreateConfigs(fb_format[color], fb_type[color],
++				     depth_bits,
++				     stencil_bits,
++				     ARRAY_SIZE(depth_bits),
++				     back_buffer_modes,
++				     ARRAY_SIZE(back_buffer_modes),
++				     msaa_samples_array,
++				     ARRAY_SIZE(msaa_samples_array));
++      if (configs == NULL) /* first usable set becomes the list; later sets are appended */
++	 configs = new_configs;
++      else
++	 configs = driConcatConfigs(configs, new_configs);
++   }
 +
-+	mt->bo = radeon_bo_open(rmesa->radeonScreen->bom,
-+                            0, mt->totalsize, 1024,
-+                            RADEON_GEM_DOMAIN_VRAM,
-+                            0);
++   if (configs == NULL) {
++      fprintf(stderr, "[%s:%u] Error creating FBConfig!\n", __func__,
++              __LINE__);
++      return NULL;
++   }
 +
-+	return mt;
++   return (const __DRIconfig **)configs;
 +}
-+
-+void radeon_miptree_reference(radeon_mipmap_tree *mt)
+ 
+ /**
+  * Get information about previous buffer swaps.
+@@ -1220,11 +1535,7 @@ radeonInitScreen(__DRIscreenPrivate *psp)
+ static int
+ getSwapInfo( __DRIdrawablePrivate *dPriv, __DRIswapInfo * sInfo )
+ {
+-#if !RADEON_COMMON || (RADEON_COMMON && defined(RADEON_COMMON_FOR_R300))
+    radeonContextPtr  rmesa;
+-#elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
+-   r200ContextPtr  rmesa;
+-#endif
+ 
+    if ( (dPriv == NULL) || (dPriv->driContextPriv == NULL)
+ 	|| (dPriv->driContextPriv->driverPrivate == NULL)
+@@ -1261,6 +1572,8 @@ const struct __DriverAPIRec driDriverAPI = {
+    .WaitForSBC      = NULL,
+    .SwapBuffersMSC  = NULL,
+    .CopySubBuffer   = radeonCopySubBuffer,
++    /* DRI2 */
++   .InitScreen2     = radeonInitScreen2,
+ };
+ #else
+ const struct __DriverAPIRec driDriverAPI = {
+@@ -1270,14 +1583,16 @@ const struct __DriverAPIRec driDriverAPI = {
+    .DestroyContext  = r200DestroyContext,
+    .CreateBuffer    = radeonCreateBuffer,
+    .DestroyBuffer   = radeonDestroyBuffer,
+-   .SwapBuffers     = r200SwapBuffers,
+-   .MakeCurrent     = r200MakeCurrent,
+-   .UnbindContext   = r200UnbindContext,
++   .SwapBuffers     = radeonSwapBuffers,
++   .MakeCurrent     = radeonMakeCurrent,
++   .UnbindContext   = radeonUnbindContext,
+    .GetSwapInfo     = getSwapInfo,
+    .GetDrawableMSC  = driDrawableGetMSC32,
+    .WaitForMSC      = driWaitForMSC32,
+    .WaitForSBC      = NULL,
+    .SwapBuffersMSC  = NULL,
+-   .CopySubBuffer   = r200CopySubBuffer,
++   .CopySubBuffer   = radeonCopySubBuffer,
++   .InitScreen2     = radeonInitScreen2,
+ };
+ #endif
++
+diff --git a/src/mesa/drivers/dri/radeon/radeon_screen.h b/src/mesa/drivers/dri/radeon/radeon_screen.h
+index b84c70b..1c0f5bb 100644
+--- a/src/mesa/drivers/dri/radeon/radeon_screen.h
++++ b/src/mesa/drivers/dri/radeon/radeon_screen.h
+@@ -54,7 +54,7 @@ typedef struct {
+    drmAddress map;			/* Mapping of the DRM region */
+ } radeonRegionRec, *radeonRegionPtr;
+ 
+-typedef struct {
++typedef struct radeon_screen {
+    int chip_family;
+    int chip_flags;
+    int cpp;
+@@ -103,9 +103,12 @@ typedef struct {
+    /* Configuration cache with default values for all contexts */
+    driOptionCache optionCache;
+ 
+-   const __DRIextension *extensions[8];
++   const __DRIextension *extensions[16];
+ 
+    int num_gb_pipes;
++   int kernel_mm;
++   drm_radeon_sarea_t *sarea;	/* Private SAREA data */
++   struct radeon_bo_manager *bom;
+ } radeonScreenRec, *radeonScreenPtr;
+ 
+ #define IS_R100_CLASS(screen) \
+diff --git a/src/mesa/drivers/dri/radeon/radeon_span.c b/src/mesa/drivers/dri/radeon/radeon_span.c
+index 12051ff..49ec2c3 100644
+--- a/src/mesa/drivers/dri/radeon/radeon_span.c
++++ b/src/mesa/drivers/dri/radeon/radeon_span.c
+@@ -43,37 +43,168 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "main/glheader.h"
+ #include "swrast/swrast.h"
+ 
+-#include "radeon_context.h"
+-#include "radeon_ioctl.h"
+-#include "radeon_state.h"
++#include "radeon_common.h"
++#include "radeon_lock.h"
+ #include "radeon_span.h"
+-#include "radeon_tex.h"
+-
+-#include "drirenderbuffer.h"
+ 
+ #define DBG 0
+ 
++static GLubyte *radeon_ptr32(const struct radeon_renderbuffer * rrb, /* address of pixel (x, y), 4 bytes per pixel in the tiled paths */
++			     GLint x, GLint y)
 +{
-+	mt->refcount++;
-+	assert(mt->refcount > 0);
++    GLubyte *ptr = rrb->bo->ptr; /* base pointer taken from the buffer object */
++    const __DRIdrawablePrivate *dPriv = rrb->dPriv;
++    uint32_t mask = RADEON_BO_FLAGS_MACRO_TILE | RADEON_BO_FLAGS_MICRO_TILE;
++    GLint offset;
++    GLint nmacroblkpl;
++    GLint nmicroblkpl;
++
++    x += dPriv->x; /* shift by the drawable's x/y before addressing */
++    y += dPriv->y;
++
++    if (rrb->has_surface || !(rrb->bo->flags & mask)) { /* surface present or no tiling flag set: linear addressing */
++        offset = x * rrb->cpp + y * rrb->pitch;
++    } else {
++        offset = 0;
++        if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE) {
++            if (rrb->bo->flags & RADEON_BO_FLAGS_MICRO_TILE) { /* macro + micro tiled */
++                nmacroblkpl = rrb->pitch >> 5;
++                offset += ((y >> 4) * nmacroblkpl) << 11;
++                offset += ((y & 15) >> 1) << 8;
++                offset += (y & 1) << 4;
++                offset += (x >> 5) << 11;
++                offset += ((x & 31) >> 2) << 5;
++                offset += (x & 3) << 2;
++            } else { /* macro tiled only */
++                nmacroblkpl = rrb->pitch >> 6;
++                offset += ((y >> 3) * nmacroblkpl) << 11;
++                offset += (y & 7) << 8;
++                offset += (x >> 6) << 11;
++                offset += ((x & 63) >> 3) << 5;
++                offset += (x & 7) << 2;
++            }
++        } else { /* micro tiled only */
++            nmicroblkpl = ((rrb->pitch + 31) & ~31) >> 5;
++            offset += (y * nmicroblkpl) << 5;
++            offset += (x >> 3) << 5;
++            offset += (x & 7) << 2;
++        }
++    }
++    return &ptr[offset];
 +}
 +
-+void radeon_miptree_unreference(radeon_mipmap_tree *mt)
++static GLubyte *radeon_ptr16(const struct radeon_renderbuffer * rrb, /* address of pixel (x, y), 2 bytes per pixel in the tiled paths */
++			     GLint x, GLint y)
 +{
-+	if (!mt)
-+		return;
-+
-+	assert(mt->refcount > 0);
-+	mt->refcount--;
-+	if (!mt->refcount) {
-+		radeon_bo_unref(mt->bo);
-+		free(mt);
-+	}
++    GLubyte *ptr = rrb->bo->ptr; /* base pointer taken from the buffer object */
++    const __DRIdrawablePrivate *dPriv = rrb->dPriv;
++    uint32_t mask = RADEON_BO_FLAGS_MACRO_TILE | RADEON_BO_FLAGS_MICRO_TILE;
++    GLint offset;
++    GLint nmacroblkpl;
++    GLint nmicroblkpl;
++
++    x += dPriv->x; /* shift by the drawable's x/y before addressing */
++    y += dPriv->y;
++
++    if (rrb->has_surface || !(rrb->bo->flags & mask)) { /* surface present or no tiling flag set: linear addressing */
++        offset = x * rrb->cpp + y * rrb->pitch;
++    } else {
++        offset = 0;
++        if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE) {
++            if (rrb->bo->flags & RADEON_BO_FLAGS_MICRO_TILE) { /* macro + micro tiled */
++                nmacroblkpl = rrb->pitch >> 6;
++                offset += ((y >> 4) * nmacroblkpl) << 11;
++                offset += ((y & 15) >> 1) << 8;
++                offset += (y & 1) << 4;
++                offset += (x >> 6) << 11;
++                offset += ((x & 63) >> 3) << 5;
++                offset += (x & 7) << 1;
++            } else { /* macro tiled only */
++                nmacroblkpl = rrb->pitch >> 7;
++                offset += ((y >> 3) * nmacroblkpl) << 11;
++                offset += (y & 7) << 8;
++                offset += (x >> 7) << 11;
++                offset += ((x & 127) >> 4) << 5;
++                offset += (x & 15) << 1; /* 2 bytes per pixel; << 2 overran the 32-byte micro block */
++            }
++        } else { /* micro tiled only */
++            nmicroblkpl = ((rrb->pitch + 31) & ~31) >> 5;
++            offset += (y * nmicroblkpl) << 5;
++            offset += (x >> 4) << 5;
++            offset += (x & 15) << 1; /* 2 bytes per pixel, matching radeon_ptr's (x & 15) * cpp */
++        }
++    }
++    return &ptr[offset];
 +}
 +
-+
-+/**
-+ * Calculate first and last mip levels for the given texture object,
-+ * where the dimensions are taken from the given texture image at
-+ * the given level.
-+ *
-+ * Note: level is the OpenGL level number, which is not necessarily the same
-+ * as the first level that is actually present.
-+ *
-+ * The base level image of the given texture face must be non-null,
-+ * or this will fail.
-+ */
-+static void calculate_first_last_level(struct gl_texture_object *tObj,
-+				       GLuint *pfirstLevel, GLuint *plastLevel,
-+				       GLuint face, GLuint level)
++static GLubyte *radeon_ptr(const struct radeon_renderbuffer * rrb, /* address of pixel (x, y) for any rrb->cpp */
++			   GLint x, GLint y)
 +{
-+	const struct gl_texture_image * const baseImage =
-+		tObj->Image[face][level];
-+
-+	assert(baseImage);
-+	
-+	/* These must be signed values.  MinLod and MaxLod can be negative numbers,
-+	* and having firstLevel and lastLevel as signed prevents the need for
-+	* extra sign checks.
-+	*/
-+	int   firstLevel;
-+	int   lastLevel;
++    GLubyte *ptr = rrb->bo->ptr; /* base pointer taken from the buffer object */
++    const __DRIdrawablePrivate *dPriv = rrb->dPriv;
++    uint32_t mask = RADEON_BO_FLAGS_MACRO_TILE | RADEON_BO_FLAGS_MICRO_TILE;
++    GLint offset;
++    GLint microblkxs; /* pixels per micro block row, derived from cpp */
++    GLint macroblkxs; /* pixels per macro block row, derived from cpp */
++    GLint nmacroblkpl;
++    GLint nmicroblkpl;
++
++    x += dPriv->x; /* shift by the drawable's x/y before addressing */
++    y += dPriv->y;
++
++    if (rrb->has_surface || !(rrb->bo->flags & mask)) { /* surface present or no tiling flag set: linear addressing */
++        offset = x * rrb->cpp + y * rrb->pitch;
++    } else {
++        offset = 0;
++        if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE) {
++            if (rrb->bo->flags & RADEON_BO_FLAGS_MICRO_TILE) { /* macro + micro tiled */
++                microblkxs = 16 / rrb->cpp;
++                macroblkxs = 128 / rrb->cpp;
++                nmacroblkpl = rrb->pitch / macroblkxs;
++                offset += ((y >> 4) * nmacroblkpl) << 11;
++                offset += ((y & 15) >> 1) << 8;
++                offset += (y & 1) << 4;
++                offset += (x / macroblkxs) << 11;
++                offset += ((x & (macroblkxs - 1)) / microblkxs) << 5;
++                offset += (x & (microblkxs - 1)) * rrb->cpp;
++            } else { /* macro tiled only */
++                microblkxs = 32 / rrb->cpp;
++                macroblkxs = 256 / rrb->cpp;
++                nmacroblkpl = rrb->pitch / macroblkxs;
++                offset += ((y >> 3) * nmacroblkpl) << 11;
++                offset += (y & 7) << 8;
++                offset += (x / macroblkxs) << 11;
++                offset += ((x & (macroblkxs - 1)) / microblkxs) << 5;
++                offset += (x & (microblkxs - 1)) * rrb->cpp;
++            }
++        } else { /* micro tiled only */
++            microblkxs = 32 / rrb->cpp;
++            nmicroblkpl = ((rrb->pitch + 31) & ~31) >> 5;
++            offset += (y * nmicroblkpl) << 5;
++            offset += (x / microblkxs) << 5;
++            offset += (x & (microblkxs - 1)) * rrb->cpp; /* mask form assumes microblkxs is a power of two - TODO confirm cpp is 1, 2 or 4 */
++        }
++    }
++    return &ptr[offset];
++}
 +
-+	/* Yes, this looks overly complicated, but it's all needed.
-+	*/
-+	switch (tObj->Target) {
-+	case GL_TEXTURE_1D:
-+	case GL_TEXTURE_2D:
-+	case GL_TEXTURE_3D:
-+	case GL_TEXTURE_CUBE_MAP:
-+		if (tObj->MinFilter == GL_NEAREST || tObj->MinFilter == GL_LINEAR) {
-+			/* GL_NEAREST and GL_LINEAR only care about GL_TEXTURE_BASE_LEVEL.
-+			*/
-+			firstLevel = lastLevel = tObj->BaseLevel;
-+		} else {
-+			firstLevel = tObj->BaseLevel + (GLint)(tObj->MinLod + 0.5);
-+			firstLevel = MAX2(firstLevel, tObj->BaseLevel);
-+			firstLevel = MIN2(firstLevel, level + baseImage->MaxLog2);
-+			lastLevel = tObj->BaseLevel + (GLint)(tObj->MaxLod + 0.5);
-+			lastLevel = MAX2(lastLevel, tObj->BaseLevel);
-+			lastLevel = MIN2(lastLevel, level + baseImage->MaxLog2);
-+			lastLevel = MIN2(lastLevel, tObj->MaxLevel);
-+			lastLevel = MAX2(firstLevel, lastLevel); /* need at least one level */
++
+ /*
+  * Note that all information needed to access pixels in a renderbuffer
+  * should be obtained through the gl_renderbuffer parameter, not per-context
+  * information.
+  */
+ #define LOCAL_VARS						\
+-   driRenderbuffer *drb = (driRenderbuffer *) rb;		\
+-   const __DRIdrawablePrivate *dPriv = drb->dPriv;		\
++   struct radeon_renderbuffer *rrb = (void *) rb;		\
++   const __DRIdrawablePrivate *dPriv = rrb->dPriv;		\
+    const GLuint bottom = dPriv->h - 1;				\
+-   GLubyte *buf = (GLubyte *) drb->flippedData			\
+-      + (dPriv->y * drb->flippedPitch + dPriv->x) * drb->cpp;	\
+-   GLuint p;							\
+-   (void) p;
++   GLuint p;						\
++   (void)p;
+ 
+ #define LOCAL_DEPTH_VARS				\
+-   driRenderbuffer *drb = (driRenderbuffer *) rb;	\
+-   const __DRIdrawablePrivate *dPriv = drb->dPriv;	\
+-   const GLuint bottom = dPriv->h - 1;			\
+-   GLuint xo = dPriv->x;				\
+-   GLuint yo = dPriv->y;				\
+-   GLubyte *buf = (GLubyte *) drb->Base.Data;
++   struct radeon_renderbuffer *rrb = (void *) rb;	\
++   const __DRIdrawablePrivate *dPriv = rrb->dPriv;	\
++   const GLuint bottom = dPriv->h - 1;
+ 
+ #define LOCAL_STENCIL_VARS LOCAL_DEPTH_VARS
+ 
+@@ -94,7 +225,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ #define TAG(x)    radeon##x##_RGB565
+ #define TAG2(x,y) radeon##x##_RGB565##y
+-#define GET_PTR(X,Y) (buf + ((Y) * drb->flippedPitch + (X)) * 2)
++#define GET_PTR(X,Y) radeon_ptr16(rrb, (X), (Y))
+ #include "spantmp2.h"
+ 
+ /* 32 bit, ARGB8888 color spanline and pixel functions
+@@ -104,7 +235,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ #define TAG(x)    radeon##x##_ARGB8888
+ #define TAG2(x,y) radeon##x##_ARGB8888##y
+-#define GET_PTR(X,Y) (buf + ((Y) * drb->flippedPitch + (X)) * 4)
++#define GET_PTR(X,Y) radeon_ptr32(rrb, (X), (Y))
+ #include "spantmp2.h"
+ 
+ /* ================================================================
+@@ -121,65 +252,15 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+  * too...
+  */
+ 
+-static GLuint radeon_mba_z32(const driRenderbuffer * drb, GLint x, GLint y)
+-{
+-	GLuint pitch = drb->pitch;
+-	if (drb->depthHasSurface) {
+-		return 4 * (x + y * pitch);
+-	} else {
+-		GLuint ba, address = 0;	/* a[0..1] = 0           */
+-
+-#ifdef COMPILE_R300
+-		ba = (y / 8) * (pitch / 8) + (x / 8);
+-#else
+-		ba = (y / 16) * (pitch / 16) + (x / 16);
+-#endif
+-
+-		address |= (x & 0x7) << 2;	/* a[2..4] = x[0..2]     */
+-		address |= (y & 0x3) << 5;	/* a[5..6] = y[0..1]     */
+-		address |= (((x & 0x10) >> 2) ^ (y & 0x4)) << 5;	/* a[7]    = x[4] ^ y[2] */
+-		address |= (ba & 0x3) << 8;	/* a[8..9] = ba[0..1]    */
+-
+-		address |= (y & 0x8) << 7;	/* a[10]   = y[3]        */
+-		address |= (((x & 0x8) << 1) ^ (y & 0x10)) << 7;	/* a[11]   = x[3] ^ y[4] */
+-		address |= (ba & ~0x3) << 10;	/* a[12..] = ba[2..]     */
+-
+-		return address;
+-	}
+-}
+-
+-static INLINE GLuint
+-radeon_mba_z16(const driRenderbuffer * drb, GLint x, GLint y)
+-{
+-	GLuint pitch = drb->pitch;
+-	if (drb->depthHasSurface) {
+-		return 2 * (x + y * pitch);
+-	} else {
+-		GLuint ba, address = 0;	/* a[0]    = 0           */
+-
+-		ba = (y / 16) * (pitch / 32) + (x / 32);
+-
+-		address |= (x & 0x7) << 1;	/* a[1..3] = x[0..2]     */
+-		address |= (y & 0x7) << 4;	/* a[4..6] = y[0..2]     */
+-		address |= (x & 0x8) << 4;	/* a[7]    = x[3]        */
+-		address |= (ba & 0x3) << 8;	/* a[8..9] = ba[0..1]    */
+-		address |= (y & 0x8) << 7;	/* a[10]   = y[3]        */
+-		address |= ((x & 0x10) ^ (y & 0x10)) << 7;	/* a[11]   = x[4] ^ y[4] */
+-		address |= (ba & ~0x3) << 10;	/* a[12..] = ba[2..]     */
+-
+-		return address;
+-	}
+-}
+-
+ /* 16-bit depth buffer functions
+  */
+ #define VALUE_TYPE GLushort
+ 
+ #define WRITE_DEPTH( _x, _y, d )					\
+-   *(GLushort *)(buf + radeon_mba_z16( drb, _x + xo, _y + yo )) = d;
++   *(GLushort *)radeon_ptr(rrb, _x, _y) = d
+ 
+ #define READ_DEPTH( d, _x, _y )						\
+-   d = *(GLushort *)(buf + radeon_mba_z16( drb, _x + xo, _y + yo ));
++   d = *(GLushort *)radeon_ptr(rrb, _x, _y)
+ 
+ #define TAG(x) radeon##x##_z16
+ #include "depthtmp.h"
+@@ -194,35 +275,36 @@ radeon_mba_z16(const driRenderbuffer * drb, GLint x, GLint y)
+ #ifdef COMPILE_R300
+ #define WRITE_DEPTH( _x, _y, d )					\
+ do {									\
+-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
+-   GLuint tmp = *(GLuint *)(buf + offset);				\
++   GLuint *_ptr = (GLuint*)radeon_ptr32( rrb, _x, _y );		\
++   GLuint tmp = *_ptr;				\
+    tmp &= 0x000000ff;							\
+    tmp |= ((d << 8) & 0xffffff00);					\
+-   *(GLuint *)(buf + offset) = tmp;					\
++   *_ptr = tmp;					\
+ } while (0)
+ #else
+ #define WRITE_DEPTH( _x, _y, d )					\
+ do {									\
+-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
+-   GLuint tmp = *(GLuint *)(buf + offset);				\
++   GLuint *_ptr = (GLuint*)radeon_ptr32( rrb, _x, _y );		\
++   GLuint tmp = *_ptr;				\
+    tmp &= 0xff000000;							\
+    tmp |= ((d) & 0x00ffffff);						\
+-   *(GLuint *)(buf + offset) = tmp;					\
++   *_ptr = tmp;					\
+ } while (0)
+ #endif
+ 
+ #ifdef COMPILE_R300
+ #define READ_DEPTH( d, _x, _y )						\
+   do { \
+-    d = (*(GLuint *)(buf + radeon_mba_z32( drb, _x + xo,		\
+-					 _y + yo )) & 0xffffff00) >> 8; \
++    d = (*(GLuint*)(radeon_ptr32(rrb, _x, _y)) & 0xffffff00) >> 8; \
+   }while(0)
+ #else
+ #define READ_DEPTH( d, _x, _y )						\
+-   d = *(GLuint *)(buf + radeon_mba_z32( drb, _x + xo,			\
+-					 _y + yo )) & 0x00ffffff;
++   d = *(GLuint*)(radeon_ptr32(rrb, _x,	_y )) & 0x00ffffff;
+ #endif
+-
++/*
++    fprintf(stderr, "dval(%d, %d, %d, %d)=0x%08X\n", _x, xo, _y, yo, d);\
++   d = *(GLuint*)(radeon_ptr(rrb, _x,	_y )) & 0x00ffffff;
++*/
+ #define TAG(x) radeon##x##_z24_s8
+ #include "depthtmp.h"
+ 
+@@ -235,35 +317,35 @@ do {									\
+ #ifdef COMPILE_R300
+ #define WRITE_STENCIL( _x, _y, d )					\
+ do {									\
+-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
+-   GLuint tmp = *(GLuint *)(buf + offset);				\
++   GLuint *_ptr = (GLuint*)radeon_ptr32(rrb, _x, _y);		\
++   GLuint tmp = *_ptr;				\
+    tmp &= 0xffffff00;							\
+    tmp |= (d) & 0xff;							\
+-   *(GLuint *)(buf + offset) = tmp;					\
++   *_ptr = tmp;					\
+ } while (0)
+ #else
+ #define WRITE_STENCIL( _x, _y, d )					\
+ do {									\
+-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
+-   GLuint tmp = *(GLuint *)(buf + offset);				\
++   GLuint *_ptr = (GLuint*)radeon_ptr32(rrb, _x, _y);		\
++   GLuint tmp = *_ptr;				\
+    tmp &= 0x00ffffff;							\
+    tmp |= (((d) & 0xff) << 24);						\
+-   *(GLuint *)(buf + offset) = tmp;					\
++   *_ptr = tmp;					\
+ } while (0)
+ #endif
+ 
+ #ifdef COMPILE_R300
+ #define READ_STENCIL( d, _x, _y )					\
+ do {									\
+-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
+-   GLuint tmp = *(GLuint *)(buf + offset);				\
++   GLuint *_ptr = (GLuint*)radeon_ptr32( rrb, _x, _y );		\
++   GLuint tmp = *_ptr;				\
+    d = tmp & 0x000000ff;						\
+ } while (0)
+ #else
+ #define READ_STENCIL( d, _x, _y )					\
+ do {									\
+-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
+-   GLuint tmp = *(GLuint *)(buf + offset);				\
++   GLuint *_ptr = (GLuint*)radeon_ptr32( rrb, _x, _y );		\
++   GLuint tmp = *_ptr;				\
+    d = (tmp & 0xff000000) >> 24;					\
+ } while (0)
+ #endif
+@@ -271,20 +353,60 @@ do {									\
+ #define TAG(x) radeon##x##_z24_s8
+ #include "stenciltmp.h"
+ 
+-/* Move locking out to get reasonable span performance (10x better
+- * than doing this in HW_LOCK above).  WaitForIdle() is the main
+- * culprit.
+- */
++
++static void map_buffer(struct gl_renderbuffer *rb, GLboolean write)
++{
++	struct radeon_renderbuffer *rrb = (void*)rb;
++	int r;
++	
++	if (rrb->bo) {
++		r = radeon_bo_map(rrb->bo, write);
++		if (r) {
++			fprintf(stderr, "(%s) error(%d) mapping buffer.\n",
++				__FUNCTION__, r);
 +		}
-+		break;
-+	case GL_TEXTURE_RECTANGLE_NV:
-+	case GL_TEXTURE_4D_SGIS:
-+		firstLevel = lastLevel = 0;
-+		break;
-+	default:
-+		return;
 +	}
-+
-+	/* save these values */
-+	*pfirstLevel = firstLevel;
-+	*plastLevel = lastLevel;
 +}
 +
-+
-+/**
-+ * Checks whether the given miptree can hold the given texture image at the
-+ * given face and level.
-+ */
-+GLboolean radeon_miptree_matches_image(radeon_mipmap_tree *mt,
-+		struct gl_texture_image *texImage, GLuint face, GLuint level)
++static void unmap_buffer(struct gl_renderbuffer *rb)
 +{
-+	radeon_mipmap_level *lvl;
++	struct radeon_renderbuffer *rrb = (void*)rb;
 +
-+	if (face >= mt->faces || level < mt->firstLevel || level > mt->lastLevel)
-+		return GL_FALSE;
++	if (rrb->bo) {
++		radeon_bo_unmap(rrb->bo);
++	}
++}
+ 
+ static void radeonSpanRenderStart(GLcontext * ctx)
+ {
+ 	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+-#ifdef COMPILE_R300
+-	r300ContextPtr r300 = (r300ContextPtr) rmesa;
+-	R300_FIREVERTICES(r300);
+-#else
+-	RADEON_FIREVERTICES(rmesa);
+-#endif
++	int i;
 +
-+	if (texImage->IsCompressed != mt->compressed)
-+		return GL_FALSE;
++	radeon_firevertices(rmesa);
 +
-+	if (!texImage->IsCompressed &&
-+	    !mt->compressed &&
-+	    texImage->TexFormat->TexelBytes != mt->bpp)
-+		return GL_FALSE;
++	for (i = 0; i < ctx->Const.MaxTextureImageUnits; i++) {
++		if (ctx->Texture.Unit[i]._ReallyEnabled)
++			ctx->Driver.MapTexture(ctx, ctx->Texture.Unit[i]._Current);
++	}
 +
-+	lvl = &mt->levels[level - mt->firstLevel];
-+	if (lvl->width != texImage->Width ||
-+	    lvl->height != texImage->Height ||
-+	    lvl->depth != texImage->Depth)
-+		return GL_FALSE;
++	/* color draw buffers */
++	for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
++		map_buffer(ctx->DrawBuffer->_ColorDrawBuffers[i], GL_TRUE);
++	}
 +
-+	return GL_TRUE;
-+}
++	map_buffer(ctx->ReadBuffer->_ColorReadBuffer, GL_FALSE);
++
++	if (ctx->DrawBuffer->_DepthBuffer) {
++		map_buffer(ctx->DrawBuffer->_DepthBuffer->Wrapped, GL_TRUE);
++	}
++	if (ctx->DrawBuffer->_StencilBuffer)
++		map_buffer(ctx->DrawBuffer->_StencilBuffer->Wrapped, GL_TRUE);
++
++	/* The locking and wait for idle should really only be needed in classic mode.
++	 * In a future memory manager based implementation, this should become
++	 * unnecessary due to the fact that mapping our buffers, textures, etc.
++	 * should implicitly wait for any previous rendering commands that must
++	 * be waited on. */
+ 	LOCK_HARDWARE(rmesa);
+ 	radeonWaitForIdleLocked(rmesa);
+ }
+@@ -292,8 +414,25 @@ static void radeonSpanRenderStart(GLcontext * ctx)
+ static void radeonSpanRenderFinish(GLcontext * ctx)
+ {
+ 	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++	int i;
+ 	_swrast_flush(ctx);
+ 	UNLOCK_HARDWARE(rmesa);
 +
++	for (i = 0; i < ctx->Const.MaxTextureImageUnits; i++) {
++		if (ctx->Texture.Unit[i]._ReallyEnabled)
++			ctx->Driver.UnmapTexture(ctx, ctx->Texture.Unit[i]._Current);
++	}
 +
-+/**
-+ * Checks whether the given miptree has the right format to store the given texture object.
++	/* color draw buffers */
++	for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++)
++		unmap_buffer(ctx->DrawBuffer->_ColorDrawBuffers[i]);
++
++	unmap_buffer(ctx->ReadBuffer->_ColorReadBuffer);
++
++	if (ctx->DrawBuffer->_DepthBuffer)
++		unmap_buffer(ctx->DrawBuffer->_DepthBuffer->Wrapped);
++	if (ctx->DrawBuffer->_StencilBuffer)
++		unmap_buffer(ctx->DrawBuffer->_StencilBuffer->Wrapped);
+ }
+ 
+ void radeonInitSpanFuncs(GLcontext * ctx)
+@@ -307,20 +446,17 @@ void radeonInitSpanFuncs(GLcontext * ctx)
+ /**
+  * Plug in the Get/Put routines for the given driRenderbuffer.
+  */
+-void radeonSetSpanFunctions(driRenderbuffer * drb, const GLvisual * vis)
++void radeonSetSpanFunctions(struct radeon_renderbuffer *rrb)
+ {
+-	if (drb->Base.InternalFormat == GL_RGBA) {
+-		if (vis->redBits == 5 && vis->greenBits == 6
+-		    && vis->blueBits == 5) {
+-			radeonInitPointers_RGB565(&drb->Base);
+-		} else {
+-			radeonInitPointers_ARGB8888(&drb->Base);
+-		}
+-	} else if (drb->Base.InternalFormat == GL_DEPTH_COMPONENT16) {
+-		radeonInitDepthPointers_z16(&drb->Base);
+-	} else if (drb->Base.InternalFormat == GL_DEPTH_COMPONENT24) {
+-		radeonInitDepthPointers_z24_s8(&drb->Base);
+-	} else if (drb->Base.InternalFormat == GL_STENCIL_INDEX8_EXT) {
+-		radeonInitStencilPointers_z24_s8(&drb->Base);
++	if (rrb->base.InternalFormat == GL_RGB5) {
++		radeonInitPointers_RGB565(&rrb->base);
++	} else if (rrb->base.InternalFormat == GL_RGBA8) {
++		radeonInitPointers_ARGB8888(&rrb->base);
++	} else if (rrb->base.InternalFormat == GL_DEPTH_COMPONENT16) {
++		radeonInitDepthPointers_z16(&rrb->base);
++	} else if (rrb->base.InternalFormat == GL_DEPTH_COMPONENT24) {
++		radeonInitDepthPointers_z24_s8(&rrb->base);
++	} else if (rrb->base.InternalFormat == GL_STENCIL_INDEX8_EXT) {
++		radeonInitStencilPointers_z24_s8(&rrb->base);
+ 	}
+ }
+diff --git a/src/mesa/drivers/dri/radeon/radeon_span.h b/src/mesa/drivers/dri/radeon/radeon_span.h
+index 9abe086..dd44ab5 100644
+--- a/src/mesa/drivers/dri/radeon/radeon_span.h
++++ b/src/mesa/drivers/dri/radeon/radeon_span.h
+@@ -42,9 +42,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #ifndef __RADEON_SPAN_H__
+ #define __RADEON_SPAN_H__
+ 
+-#include "drirenderbuffer.h"
+-
+ extern void radeonInitSpanFuncs(GLcontext * ctx);
+-extern void radeonSetSpanFunctions(driRenderbuffer * rb, const GLvisual * vis);
+ 
++extern void radeonSetSpanFunctions(struct radeon_renderbuffer *rrb);
+ #endif
+diff --git a/src/mesa/drivers/dri/radeon/radeon_state.c b/src/mesa/drivers/dri/radeon/radeon_state.c
+index 32bcff3..5fffa28 100644
+--- a/src/mesa/drivers/dri/radeon/radeon_state.c
++++ b/src/mesa/drivers/dri/radeon/radeon_state.c
+@@ -62,7 +62,7 @@ static void radeonUpdateSpecular( GLcontext *ctx );
+ 
+ static void radeonAlphaFunc( GLcontext *ctx, GLenum func, GLfloat ref )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    int pp_misc = rmesa->hw.ctx.cmd[CTX_PP_MISC];
+    GLubyte refByte;
+ 
+@@ -106,7 +106,7 @@ static void radeonAlphaFunc( GLcontext *ctx, GLenum func, GLfloat ref )
+ static void radeonBlendEquationSeparate( GLcontext *ctx,
+ 					 GLenum modeRGB, GLenum modeA )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    GLuint b = rmesa->hw.ctx.cmd[CTX_RB3D_BLENDCNTL] & ~RADEON_COMB_FCN_MASK;
+    GLboolean fallback = GL_FALSE;
+ 
+@@ -147,7 +147,7 @@ static void radeonBlendFuncSeparate( GLcontext *ctx,
+ 				     GLenum sfactorRGB, GLenum dfactorRGB,
+ 				     GLenum sfactorA, GLenum dfactorA )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    GLuint b = rmesa->hw.ctx.cmd[CTX_RB3D_BLENDCNTL] & 
+       ~(RADEON_SRC_BLEND_MASK | RADEON_DST_BLEND_MASK);
+    GLboolean fallback = GL_FALSE;
+@@ -257,7 +257,7 @@ static void radeonBlendFuncSeparate( GLcontext *ctx,
+ 
+ static void radeonDepthFunc( GLcontext *ctx, GLenum func )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+ 
+    RADEON_STATECHANGE( rmesa, ctx );
+    rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] &= ~RADEON_Z_TEST_MASK;
+@@ -293,7 +293,7 @@ static void radeonDepthFunc( GLcontext *ctx, GLenum func )
+ 
+ static void radeonDepthMask( GLcontext *ctx, GLboolean flag )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    RADEON_STATECHANGE( rmesa, ctx );
+ 
+    if ( ctx->Depth.Mask ) {
+@@ -305,16 +305,16 @@ static void radeonDepthMask( GLcontext *ctx, GLboolean flag )
+ 
+ static void radeonClearDepth( GLcontext *ctx, GLclampd d )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    GLuint format = (rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] &
+ 		    RADEON_DEPTH_FORMAT_MASK);
+ 
+    switch ( format ) {
+    case RADEON_DEPTH_FORMAT_16BIT_INT_Z:
+-      rmesa->state.depth.clear = d * 0x0000ffff;
++      rmesa->radeon.state.depth.clear = d * 0x0000ffff;
+       break;
+    case RADEON_DEPTH_FORMAT_24BIT_INT_Z:
+-      rmesa->state.depth.clear = d * 0x00ffffff;
++      rmesa->radeon.state.depth.clear = d * 0x00ffffff;
+       break;
+    }
+ }
+@@ -327,7 +327,7 @@ static void radeonClearDepth( GLcontext *ctx, GLclampd d )
+ 
+ static void radeonFogfv( GLcontext *ctx, GLenum pname, const GLfloat *param )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    union { int i; float f; } c, d;
+    GLchan col[4];
+ 
+@@ -406,109 +406,13 @@ static void radeonFogfv( GLcontext *ctx, GLenum pname, const GLfloat *param )
+    }
+ }
+ 
+-
+-/* =============================================================
+- * Scissoring
+- */
+-
+-
+-static GLboolean intersect_rect( drm_clip_rect_t *out,
+-				 drm_clip_rect_t *a,
+-				 drm_clip_rect_t *b )
+-{
+-   *out = *a;
+-   if ( b->x1 > out->x1 ) out->x1 = b->x1;
+-   if ( b->y1 > out->y1 ) out->y1 = b->y1;
+-   if ( b->x2 < out->x2 ) out->x2 = b->x2;
+-   if ( b->y2 < out->y2 ) out->y2 = b->y2;
+-   if ( out->x1 >= out->x2 ) return GL_FALSE;
+-   if ( out->y1 >= out->y2 ) return GL_FALSE;
+-   return GL_TRUE;
+-}
+-
+-
+-void radeonRecalcScissorRects( radeonContextPtr rmesa )
+-{
+-   drm_clip_rect_t *out;
+-   int i;
+-
+-   /* Grow cliprect store?
+-    */
+-   if (rmesa->state.scissor.numAllocedClipRects < rmesa->numClipRects) {
+-      while (rmesa->state.scissor.numAllocedClipRects < rmesa->numClipRects) {
+-	 rmesa->state.scissor.numAllocedClipRects += 1;	/* zero case */
+-	 rmesa->state.scissor.numAllocedClipRects *= 2;
+-      }
+-
+-      if (rmesa->state.scissor.pClipRects)
+-	 FREE(rmesa->state.scissor.pClipRects);
+-
+-      rmesa->state.scissor.pClipRects = 
+-	 MALLOC( rmesa->state.scissor.numAllocedClipRects * 
+-		 sizeof(drm_clip_rect_t) );
+-
+-      if ( rmesa->state.scissor.pClipRects == NULL ) {
+-	 rmesa->state.scissor.numAllocedClipRects = 0;
+-	 return;
+-      }
+-   }
+-   
+-   out = rmesa->state.scissor.pClipRects;
+-   rmesa->state.scissor.numClipRects = 0;
+-
+-   for ( i = 0 ; i < rmesa->numClipRects ;  i++ ) {
+-      if ( intersect_rect( out, 
+-			   &rmesa->pClipRects[i], 
+-			   &rmesa->state.scissor.rect ) ) {
+-	 rmesa->state.scissor.numClipRects++;
+-	 out++;
+-      }
+-   }
+-}
+-
+-
+-static void radeonUpdateScissor( GLcontext *ctx )
+-{
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+-
+-   if ( rmesa->dri.drawable ) {
+-      __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
+-
+-      int x = ctx->Scissor.X;
+-      int y = dPriv->h - ctx->Scissor.Y - ctx->Scissor.Height;
+-      int w = ctx->Scissor.X + ctx->Scissor.Width - 1;
+-      int h = dPriv->h - ctx->Scissor.Y - 1;
+-
+-      rmesa->state.scissor.rect.x1 = x + dPriv->x;
+-      rmesa->state.scissor.rect.y1 = y + dPriv->y;
+-      rmesa->state.scissor.rect.x2 = w + dPriv->x + 1;
+-      rmesa->state.scissor.rect.y2 = h + dPriv->y + 1;
+-
+-      radeonRecalcScissorRects( rmesa );
+-   }
+-}
+-
+-
+-static void radeonScissor( GLcontext *ctx,
+-			   GLint x, GLint y, GLsizei w, GLsizei h )
+-{
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+-
+-   if ( ctx->Scissor.Enabled ) {
+-      RADEON_FIREVERTICES( rmesa );	/* don't pipeline cliprect changes */
+-      radeonUpdateScissor( ctx );
+-   }
+-
+-}
+-
+-
+ /* =============================================================
+  * Culling
+  */
+ 
+ static void radeonCullFace( GLcontext *ctx, GLenum unused )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    GLuint s = rmesa->hw.set.cmd[SET_SE_CNTL];
+    GLuint t = rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL];
+ 
+@@ -545,7 +449,7 @@ static void radeonCullFace( GLcontext *ctx, GLenum unused )
+ 
+ static void radeonFrontFace( GLcontext *ctx, GLenum mode )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+ 
+    RADEON_STATECHANGE( rmesa, set );
+    rmesa->hw.set.cmd[SET_SE_CNTL] &= ~RADEON_FFACE_CULL_DIR_MASK;
+@@ -570,7 +474,7 @@ static void radeonFrontFace( GLcontext *ctx, GLenum mode )
+  */
+ static void radeonLineWidth( GLcontext *ctx, GLfloat widthf )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+ 
+    RADEON_STATECHANGE( rmesa, lin );
+    RADEON_STATECHANGE( rmesa, set );
+@@ -587,7 +491,7 @@ static void radeonLineWidth( GLcontext *ctx, GLfloat widthf )
+ 
+ static void radeonLineStipple( GLcontext *ctx, GLint factor, GLushort pattern )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+ 
+    RADEON_STATECHANGE( rmesa, lin );
+    rmesa->hw.lin.cmd[LIN_RE_LINE_PATTERN] = 
+@@ -602,8 +506,8 @@ static void radeonColorMask( GLcontext *ctx,
+ 			     GLboolean r, GLboolean g,
+ 			     GLboolean b, GLboolean a )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+-   GLuint mask = radeonPackColor( rmesa->radeonScreen->cpp,
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
++   GLuint mask = radeonPackColor( rmesa->radeon.radeonScreen->cpp,
+ 				  ctx->Color.ColorMask[RCOMP],
+ 				  ctx->Color.ColorMask[GCOMP],
+ 				  ctx->Color.ColorMask[BCOMP],
+@@ -623,8 +527,8 @@ static void radeonColorMask( GLcontext *ctx,
+ static void radeonPolygonOffset( GLcontext *ctx,
+ 				 GLfloat factor, GLfloat units )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+-   float_ui32_type constant =  { units * rmesa->state.depth.scale };
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
++   float_ui32_type constant =  { units * rmesa->radeon.state.depth.scale };
+    float_ui32_type factoru = { factor };
+ 
+    RADEON_STATECHANGE( rmesa, zbs );
+@@ -634,7 +538,7 @@ static void radeonPolygonOffset( GLcontext *ctx,
+ 
+ static void radeonPolygonStipple( GLcontext *ctx, const GLubyte *mask )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    GLuint i;
+    drm_radeon_stipple_t stipple;
+ 
+@@ -646,27 +550,27 @@ static void radeonPolygonStipple( GLcontext *ctx, const GLubyte *mask )
+ 
+    /* TODO: push this into cmd mechanism
+     */
+-   RADEON_FIREVERTICES( rmesa );
+-   LOCK_HARDWARE( rmesa );
++   radeon_firevertices(&rmesa->radeon);
++   LOCK_HARDWARE( &rmesa->radeon );
+ 
+    /* FIXME: Use window x,y offsets into stipple RAM.
+     */
+    stipple.mask = rmesa->state.stipple.mask;
+-   drmCommandWrite( rmesa->dri.fd, DRM_RADEON_STIPPLE, 
++   drmCommandWrite( rmesa->radeon.dri.fd, DRM_RADEON_STIPPLE, 
+                     &stipple, sizeof(drm_radeon_stipple_t) );
+-   UNLOCK_HARDWARE( rmesa );
++   UNLOCK_HARDWARE( &rmesa->radeon );
+ }
+ 
+ static void radeonPolygonMode( GLcontext *ctx, GLenum face, GLenum mode )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    GLboolean flag = (ctx->_TriangleCaps & DD_TRI_UNFILLED) != 0;
+ 
+    /* Can't generally do unfilled via tcl, but some good special
+     * cases work. 
+     */
+    TCL_FALLBACK( ctx, RADEON_TCL_FALLBACK_UNFILLED, flag);
+-   if (rmesa->TclFallback) {
++   if (rmesa->radeon.TclFallback) {
+       radeonChooseRenderState( ctx );
+       radeonChooseVertexState( ctx );
+    }
+@@ -686,7 +590,7 @@ static void radeonPolygonMode( GLcontext *ctx, GLenum face, GLenum mode )
+  */
+ static void radeonUpdateSpecular( GLcontext *ctx )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    uint32_t p = rmesa->hw.ctx.cmd[CTX_PP_CNTL];
+    GLuint flag = 0;
+ 
+@@ -757,7 +661,7 @@ static void radeonUpdateSpecular( GLcontext *ctx )
+ 
+    /* Update vertex/render formats
+     */
+-   if (rmesa->TclFallback) { 
++   if (rmesa->radeon.TclFallback) { 
+       radeonChooseRenderState( ctx );
+       radeonChooseVertexState( ctx );
+    }
+@@ -774,7 +678,7 @@ static void radeonUpdateSpecular( GLcontext *ctx )
+  */
+ static void update_global_ambient( GLcontext *ctx )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    float *fcmd = (float *)RADEON_DB_STATE( glt );
+ 
+    /* Need to do more if both emmissive & ambient are PREMULT:
+@@ -809,7 +713,7 @@ static void update_light_colors( GLcontext *ctx, GLuint p )
+ /*     fprintf(stderr, "%s\n", __FUNCTION__); */
+ 
+    if (l->Enabled) {
+-      radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++      r100ContextPtr rmesa = R100_CONTEXT(ctx);
+       float *fcmd = (float *)RADEON_DB_STATE( lit[p] );
+ 
+       COPY_4V( &fcmd[LIT_AMBIENT_RED], l->Ambient );	 
+@@ -849,7 +753,7 @@ static void check_twoside_fallback( GLcontext *ctx )
+ 
+ static void radeonColorMaterial( GLcontext *ctx, GLenum face, GLenum mode )
+ {
+-      radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++      r100ContextPtr rmesa = R100_CONTEXT(ctx);
+       GLuint light_model_ctl1 = rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL];
+ 
+       light_model_ctl1 &= ~((3 << RADEON_EMISSIVE_SOURCE_SHIFT) |
+@@ -913,7 +817,7 @@ static void radeonColorMaterial( GLcontext *ctx, GLenum face, GLenum mode )
+ 
+ void radeonUpdateMaterial( GLcontext *ctx )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    GLfloat (*mat)[4] = ctx->Light.Material.Attrib;
+    GLfloat *fcmd = (GLfloat *)RADEON_DB_STATE( mtl );
+    GLuint mask = ~0;
+@@ -978,7 +882,7 @@ void radeonUpdateMaterial( GLcontext *ctx )
+  */
+ static void update_light( GLcontext *ctx )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+ 
+    /* Have to check these, or have an automatic shortcircuit mechanism
+     * to remove noop statechanges. (Or just do a better job on the
+@@ -1043,7 +947,7 @@ static void update_light( GLcontext *ctx )
+ static void radeonLightfv( GLcontext *ctx, GLenum light,
+ 			   GLenum pname, const GLfloat *params )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    GLint p = light - GL_LIGHT0;
+    struct gl_light *l = &ctx->Light.Light[p];
+    GLfloat *fcmd = (GLfloat *)rmesa->hw.lit[p].cmd;
+@@ -1164,7 +1068,7 @@ static void radeonLightfv( GLcontext *ctx, GLenum light,
+ static void radeonLightModelfv( GLcontext *ctx, GLenum pname,
+ 				const GLfloat *param )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+ 
+    switch (pname) {
+       case GL_LIGHT_MODEL_AMBIENT: 
+@@ -1188,7 +1092,7 @@ static void radeonLightModelfv( GLcontext *ctx, GLenum pname,
+ 
+ 	 check_twoside_fallback( ctx );
+ 
+-	 if (rmesa->TclFallback) {
++	 if (rmesa->radeon.TclFallback) {
+ 	    radeonChooseRenderState( ctx );
+ 	    radeonChooseVertexState( ctx );
+ 	 }
+@@ -1205,7 +1109,7 @@ static void radeonLightModelfv( GLcontext *ctx, GLenum pname,
+ 
+ static void radeonShadeModel( GLcontext *ctx, GLenum mode )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    GLuint s = rmesa->hw.set.cmd[SET_SE_CNTL];
+ 
+    s &= ~(RADEON_DIFFUSE_SHADE_MASK |
+@@ -1244,7 +1148,7 @@ static void radeonShadeModel( GLcontext *ctx, GLenum mode )
+ static void radeonClipPlane( GLcontext *ctx, GLenum plane, const GLfloat *eq )
+ {
+    GLint p = (GLint) plane - (GLint) GL_CLIP_PLANE0;
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    GLint *ip = (GLint *)ctx->Transform._ClipUserPlane[p];
+ 
+    RADEON_STATECHANGE( rmesa, ucp[p] );
+@@ -1256,7 +1160,7 @@ static void radeonClipPlane( GLcontext *ctx, GLenum plane, const GLfloat *eq )
+ 
+ static void radeonUpdateClipPlanes( GLcontext *ctx )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    GLuint p;
+ 
+    for (p = 0; p < ctx->Const.MaxClipPlanes; p++) {
+@@ -1281,7 +1185,7 @@ static void
+ radeonStencilFuncSeparate( GLcontext *ctx, GLenum face, GLenum func,
+                            GLint ref, GLuint mask )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    GLuint refmask = (((ctx->Stencil.Ref[0] & 0xff) << RADEON_STENCIL_REF_SHIFT) |
+ 		     ((ctx->Stencil.ValueMask[0] & 0xff) << RADEON_STENCIL_MASK_SHIFT));
+ 
+@@ -1325,7 +1229,7 @@ radeonStencilFuncSeparate( GLcontext *ctx, GLenum face, GLenum func,
+ static void
+ radeonStencilMaskSeparate( GLcontext *ctx, GLenum face, GLuint mask )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+ 
+    RADEON_STATECHANGE( rmesa, msk );
+    rmesa->hw.msk.cmd[MSK_RB3D_STENCILREFMASK] &= ~RADEON_STENCIL_WRITE_MASK;
+@@ -1336,7 +1240,7 @@ radeonStencilMaskSeparate( GLcontext *ctx, GLenum face, GLuint mask )
+ static void radeonStencilOpSeparate( GLcontext *ctx, GLenum face, GLenum fail,
+                                      GLenum zfail, GLenum zpass )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+ 
+    /* radeon 7200 have stencil bug, DEC and INC_WRAP will actually both do DEC_WRAP,
+       and DEC_WRAP (and INVERT) will do INVERT. No way to get correct INC_WRAP and DEC,
+@@ -1349,7 +1253,7 @@ static void radeonStencilOpSeparate( GLcontext *ctx, GLenum face, GLenum fail,
+    GLuint tempRADEON_STENCIL_ZPASS_DEC_WRAP;
+    GLuint tempRADEON_STENCIL_ZPASS_INC_WRAP;
+    
+-   if (rmesa->radeonScreen->chip_flags & RADEON_CHIPSET_BROKEN_STENCIL) {
++   if (rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_BROKEN_STENCIL) {
+       tempRADEON_STENCIL_FAIL_DEC_WRAP = RADEON_STENCIL_FAIL_DEC;
+       tempRADEON_STENCIL_FAIL_INC_WRAP = RADEON_STENCIL_FAIL_INC;
+       tempRADEON_STENCIL_ZFAIL_DEC_WRAP = RADEON_STENCIL_ZFAIL_DEC;
+@@ -1455,9 +1359,9 @@ static void radeonStencilOpSeparate( GLcontext *ctx, GLenum face, GLenum fail,
+ 
+ static void radeonClearStencil( GLcontext *ctx, GLint s )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+ 
+-   rmesa->state.stencil.clear = 
++   rmesa->radeon.state.stencil.clear = 
+       ((GLuint) (ctx->Stencil.Clear & 0xff) |
+        (0xff << RADEON_STENCIL_MASK_SHIFT) |
+        ((ctx->Stencil.WriteMask[0] & 0xff) << RADEON_STENCIL_WRITEMASK_SHIFT));
+@@ -1481,20 +1385,20 @@ static void radeonClearStencil( GLcontext *ctx, GLint s )
+  */
+ void radeonUpdateWindow( GLcontext *ctx )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+-   __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
+-   GLfloat xoffset = (GLfloat)dPriv->x;
+-   GLfloat yoffset = (GLfloat)dPriv->y + dPriv->h;
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
++   __DRIdrawablePrivate *dPriv = rmesa->radeon.dri.drawable;
++   GLfloat xoffset = dPriv ? (GLfloat) dPriv->x : 0;
++   GLfloat yoffset = dPriv ? (GLfloat) dPriv->y + dPriv->h : 0;
+    const GLfloat *v = ctx->Viewport._WindowMap.m;
+ 
+    float_ui32_type sx = { v[MAT_SX] };
+    float_ui32_type tx = { v[MAT_TX] + xoffset + SUBPIXEL_X };
+    float_ui32_type sy = { - v[MAT_SY] };
+    float_ui32_type ty = { (- v[MAT_TY]) + yoffset + SUBPIXEL_Y };
+-   float_ui32_type sz = { v[MAT_SZ] * rmesa->state.depth.scale };
+-   float_ui32_type tz = { v[MAT_TZ] * rmesa->state.depth.scale };
++   float_ui32_type sz = { v[MAT_SZ] * rmesa->radeon.state.depth.scale };
++   float_ui32_type tz = { v[MAT_TZ] * rmesa->radeon.state.depth.scale };
+ 
+-   RADEON_FIREVERTICES( rmesa );
++   radeon_firevertices(&rmesa->radeon);
+    RADEON_STATECHANGE( rmesa, vpt );
+ 
+    rmesa->hw.vpt.cmd[VPT_SE_VPORT_XSCALE]  = sx.ui32;
+@@ -1524,8 +1428,8 @@ static void radeonDepthRange( GLcontext *ctx, GLclampd nearval,
+ 
+ void radeonUpdateViewportOffset( GLcontext *ctx )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+-   __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
++   __DRIdrawablePrivate *dPriv = rmesa->radeon.dri.drawable;
+    GLfloat xoffset = (GLfloat)dPriv->x;
+    GLfloat yoffset = (GLfloat)dPriv->y + dPriv->h;
+    const GLfloat *v = ctx->Viewport._WindowMap.m;
+@@ -1555,8 +1459,8 @@ void radeonUpdateViewportOffset( GLcontext *ctx )
+                 RADEON_STIPPLE_Y_OFFSET_MASK);
+ 
+          /* add magic offsets, then invert */
+-         stx = 31 - ((rmesa->dri.drawable->x - 1) & RADEON_STIPPLE_COORD_MASK);
+-         sty = 31 - ((rmesa->dri.drawable->y + rmesa->dri.drawable->h - 1)
++         stx = 31 - ((rmesa->radeon.dri.drawable->x - 1) & RADEON_STIPPLE_COORD_MASK);
++         sty = 31 - ((rmesa->radeon.dri.drawable->y + rmesa->radeon.dri.drawable->h - 1)
+                      & RADEON_STIPPLE_COORD_MASK);
+ 
+          m |= ((stx << RADEON_STIPPLE_X_OFFSET_SHIFT) |
+@@ -1580,20 +1484,20 @@ void radeonUpdateViewportOffset( GLcontext *ctx )
+ 
+ static void radeonClearColor( GLcontext *ctx, const GLfloat color[4] )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    GLubyte c[4];
+    CLAMPED_FLOAT_TO_UBYTE(c[0], color[0]);
+    CLAMPED_FLOAT_TO_UBYTE(c[1], color[1]);
+    CLAMPED_FLOAT_TO_UBYTE(c[2], color[2]);
+    CLAMPED_FLOAT_TO_UBYTE(c[3], color[3]);
+-   rmesa->state.color.clear = radeonPackColor( rmesa->radeonScreen->cpp,
++   rmesa->radeon.state.color.clear = radeonPackColor( rmesa->radeon.radeonScreen->cpp,
+ 					       c[0], c[1], c[2], c[3] );
+ }
+ 
+ 
+ static void radeonRenderMode( GLcontext *ctx, GLenum mode )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    FALLBACK( rmesa, RADEON_FALLBACK_RENDER_MODE, (mode != GL_RENDER) );
+ }
+ 
+@@ -1619,7 +1523,7 @@ static GLuint radeon_rop_tab[] = {
+ 
+ static void radeonLogicOpCode( GLcontext *ctx, GLenum opcode )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    GLuint rop = (GLuint)opcode - GL_CLEAR;
+ 
+    ASSERT( rop < 16 );
+@@ -1630,66 +1534,17 @@ static void radeonLogicOpCode( GLcontext *ctx, GLenum opcode )
+ 
+ 
+ /**
+- * Set up the cliprects for either front or back-buffer drawing.
+- */
+-void radeonSetCliprects( radeonContextPtr rmesa )
+-{
+-   __DRIdrawablePrivate *const drawable = rmesa->dri.drawable;
+-   __DRIdrawablePrivate *const readable = rmesa->dri.readable;
+-   GLframebuffer *const draw_fb = (GLframebuffer*) drawable->driverPrivate;
+-   GLframebuffer *const read_fb = (GLframebuffer*) readable->driverPrivate;
+-
+-   if (draw_fb->_ColorDrawBufferIndexes[0] == BUFFER_BACK_LEFT) {
+-      /* Can't ignore 2d windows if we are page flipping.
+-       */
+-      if ( drawable->numBackClipRects == 0 || rmesa->doPageFlip ) {
+-	 rmesa->numClipRects = drawable->numClipRects;
+-	 rmesa->pClipRects = drawable->pClipRects;
+-      }
+-      else {
+-	 rmesa->numClipRects = drawable->numBackClipRects;
+-	 rmesa->pClipRects = drawable->pBackClipRects;
+-      }
+-   }
+-   else {
+-      /* front buffer (or none, or multiple buffers */
+-      rmesa->numClipRects = drawable->numClipRects;
+-      rmesa->pClipRects = drawable->pClipRects;
+-   }
+-
+-   if ((draw_fb->Width != drawable->w) || (draw_fb->Height != drawable->h)) {
+-      _mesa_resize_framebuffer(rmesa->glCtx, draw_fb,
+-			       drawable->w, drawable->h);
+-      draw_fb->Initialized = GL_TRUE;
+-   }
+-
+-   if (drawable != readable) {
+-      if ((read_fb->Width != readable->w) || (read_fb->Height != readable->h)) {
+-	 _mesa_resize_framebuffer(rmesa->glCtx, read_fb,
+-				  readable->w, readable->h);
+-	 read_fb->Initialized = GL_TRUE;
+-      }
+-   }
+-
+-   if (rmesa->state.scissor.enabled)
+-      radeonRecalcScissorRects( rmesa );
+-
+-   rmesa->lastStamp = drawable->lastStamp;
+-}
+-
+-
+-/**
+  * Called via glDrawBuffer.
+  */
+ static void radeonDrawBuffer( GLcontext *ctx, GLenum mode )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+ 
+    if (RADEON_DEBUG & DEBUG_DRI)
+       fprintf(stderr, "%s %s\n", __FUNCTION__,
+ 	      _mesa_lookup_enum_by_nr( mode ));
+ 
+-   RADEON_FIREVERTICES(rmesa);	/* don't pipeline cliprect changes */
++   radeon_firevertices(&rmesa->radeon);	/* don't pipeline cliprect changes */
+ 
+    if (ctx->DrawBuffer->_NumColorDrawBuffers != 1) {
+       /* 0 (GL_NONE) buffers or multiple color drawing buffers */
+@@ -1707,8 +1562,9 @@ static void radeonDrawBuffer( GLcontext *ctx, GLenum mode )
+       return;
+    }
+ 
+-   radeonSetCliprects( rmesa );
+-
++   radeonSetCliprects( &rmesa->radeon );
++   if (!rmesa->radeon.radeonScreen->driScreen->dri2.enabled)
++      radeonUpdatePageFlipping(&rmesa->radeon);
+    /* We'll set the drawing engine's offset/pitch parameters later
+     * when we update other state.
+     */
+@@ -1726,7 +1582,7 @@ static void radeonReadBuffer( GLcontext *ctx, GLenum mode )
+ 
+ static void radeonEnable( GLcontext *ctx, GLenum cap, GLboolean state )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    GLuint p, flag;
+ 
+    if ( RADEON_DEBUG & DEBUG_STATE )
+@@ -1821,10 +1677,10 @@ static void radeonEnable( GLcontext *ctx, GLenum cap, GLboolean state )
+       RADEON_STATECHANGE(rmesa, ctx );
+       if ( state ) {
+ 	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  RADEON_DITHER_ENABLE;
+-	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~rmesa->state.color.roundEnable;
++	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~rmesa->radeon.state.color.roundEnable;
+       } else {
+ 	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~RADEON_DITHER_ENABLE;
+-	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  rmesa->state.color.roundEnable;
++	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  rmesa->radeon.state.color.roundEnable;
+       }
+       break;
+ 
+@@ -1971,13 +1827,13 @@ static void radeonEnable( GLcontext *ctx, GLenum cap, GLboolean state )
+    }
+ 
+    case GL_SCISSOR_TEST:
+-      RADEON_FIREVERTICES( rmesa );
+-      rmesa->state.scissor.enabled = state;
++      radeon_firevertices(&rmesa->radeon);
++      rmesa->radeon.state.scissor.enabled = state;
+       radeonUpdateScissor( ctx );
+       break;
+ 
+    case GL_STENCIL_TEST:
+-      if ( rmesa->state.stencil.hwBuffer ) {
++      if ( rmesa->radeon.state.stencil.hwBuffer ) {
+ 	 RADEON_STATECHANGE( rmesa, ctx );
+ 	 if ( state ) {
+ 	    rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  RADEON_STENCIL_ENABLE;
+@@ -2010,7 +1866,7 @@ static void radeonEnable( GLcontext *ctx, GLenum cap, GLboolean state )
+ 
+ static void radeonLightingSpaceChange( GLcontext *ctx )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    GLboolean tmp;
+    RADEON_STATECHANGE( rmesa, tcl );
+ 
+@@ -2039,7 +1895,7 @@ static void radeonLightingSpaceChange( GLcontext *ctx )
+  */
+ 
+ 
+-void radeonUploadTexMatrix( radeonContextPtr rmesa,
++void radeonUploadTexMatrix( r100ContextPtr rmesa,
+ 			    int unit, GLboolean swapcols )
+ {
+ /* Here's how this works: on r100, only 3 tex coords can be submitted, so the
+@@ -2065,7 +1921,7 @@ void radeonUploadTexMatrix( radeonContextPtr rmesa,
+    int idx = TEXMAT_0 + unit;
+    float *dest = ((float *)RADEON_DB_STATE( mat[idx] )) + MAT_ELT_0;
+    int i;
+-   struct gl_texture_unit tUnit = rmesa->glCtx->Texture.Unit[unit];
++   struct gl_texture_unit tUnit = rmesa->radeon.glCtx->Texture.Unit[unit];
+    GLfloat *src = rmesa->tmpmat[unit].m;
+ 
+    rmesa->TexMatColSwap &= ~(1 << unit);
+@@ -2119,7 +1975,7 @@ void radeonUploadTexMatrix( radeonContextPtr rmesa,
+ }
+ 
+ 
+-static void upload_matrix( radeonContextPtr rmesa, GLfloat *src, int idx )
++static void upload_matrix( r100ContextPtr rmesa, GLfloat *src, int idx )
+ {
+    float *dest = ((float *)RADEON_DB_STATE( mat[idx] ))+MAT_ELT_0;
+    int i;
+@@ -2135,7 +1991,7 @@ static void upload_matrix( radeonContextPtr rmesa, GLfloat *src, int idx )
+    RADEON_DB_STATECHANGE( rmesa, &rmesa->hw.mat[idx] );
+ }
+ 
+-static void upload_matrix_t( radeonContextPtr rmesa, GLfloat *src, int idx )
++static void upload_matrix_t( r100ContextPtr rmesa, GLfloat *src, int idx )
+ {
+    float *dest = ((float *)RADEON_DB_STATE( mat[idx] ))+MAT_ELT_0;
+    memcpy(dest, src, 16*sizeof(float));
+@@ -2145,7 +2001,7 @@ static void upload_matrix_t( radeonContextPtr rmesa, GLfloat *src, int idx )
+ 
+ static void update_texturematrix( GLcontext *ctx )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
++   r100ContextPtr rmesa = R100_CONTEXT( ctx );
+    GLuint tpc = rmesa->hw.tcl.cmd[TCL_TEXTURE_PROC_CTL];
+    GLuint vs = rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXSEL];
+    int unit;
+@@ -2217,43 +2073,32 @@ static void update_texturematrix( GLcontext *ctx )
+ void
+ radeonUpdateDrawBuffer(GLcontext *ctx)
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    struct gl_framebuffer *fb = ctx->DrawBuffer;
+-   driRenderbuffer *drb;
++   struct radeon_renderbuffer *rrb;
+ 
+    if (fb->_ColorDrawBufferIndexes[0] == BUFFER_FRONT_LEFT) {
+-      /* draw to front */
+-      drb = (driRenderbuffer *) fb->Attachment[BUFFER_FRONT_LEFT].Renderbuffer;
+-   }
+-   else if (fb->_ColorDrawBufferIndexes[0] == BUFFER_BACK_LEFT) {
+-      /* draw to back */
+-      drb = (driRenderbuffer *) fb->Attachment[BUFFER_BACK_LEFT].Renderbuffer;
+-   }
+-   else {
+-      /* drawing to multiple buffers, or none */
+-      return;
++     /* draw to front */
++     rrb = (void *) fb->Attachment[BUFFER_FRONT_LEFT].Renderbuffer;
++   } else if (fb->_ColorDrawBufferIndexes[0] == BUFFER_BACK_LEFT) {
++     /* draw to back */
++     rrb = (void *) fb->Attachment[BUFFER_BACK_LEFT].Renderbuffer;
++   } else {
++     /* drawing to multiple buffers, or none */
++     return;
+    }
+ 
+-   assert(drb);
+-   assert(drb->flippedPitch);
++   assert(rrb);
++   assert(rrb->pitch);
+ 
+    RADEON_STATECHANGE( rmesa, ctx );
+-
+-   /* Note: we used the (possibly) page-flipped values */
+-   rmesa->hw.ctx.cmd[CTX_RB3D_COLOROFFSET]
+-     = ((drb->flippedOffset + rmesa->radeonScreen->fbLocation)
+-	& RADEON_COLOROFFSET_MASK);
+-   rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] = drb->flippedPitch;
+-   if (rmesa->sarea->tiling_enabled) {
+-      rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |= RADEON_COLOR_TILE_ENABLE;
+-   }
+ }
+ 
+ 
+ void radeonValidateState( GLcontext *ctx )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+-   GLuint new_state = rmesa->NewGLState;
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
++   GLuint new_state = rmesa->radeon.NewGLState;
+ 
+    if (new_state & (_NEW_BUFFERS | _NEW_COLOR | _NEW_PIXEL)) {
+      radeonUpdateDrawBuffer(ctx);
+@@ -2261,7 +2106,7 @@ void radeonValidateState( GLcontext *ctx )
+ 
+    if (new_state & _NEW_TEXTURE) {
+       radeonUpdateTextureState( ctx );
+-      new_state |= rmesa->NewGLState; /* may add TEXTURE_MATRIX */
++      new_state |= rmesa->radeon.NewGLState; /* may add TEXTURE_MATRIX */
+    }
+ 
+    /* Need an event driven matrix update?
+@@ -2295,7 +2140,7 @@ void radeonValidateState( GLcontext *ctx )
+    }
+ 
+ 
+-   rmesa->NewGLState = 0;
++   rmesa->radeon.NewGLState = 0;
+ }
+ 
+ 
+@@ -2306,7 +2151,7 @@ static void radeonInvalidateState( GLcontext *ctx, GLuint new_state )
+    _vbo_InvalidateState( ctx, new_state );
+    _tnl_InvalidateState( ctx, new_state );
+    _ae_invalidate_state( ctx, new_state );
+-   RADEON_CONTEXT(ctx)->NewGLState |= new_state;
++   R100_CONTEXT(ctx)->radeon.NewGLState |= new_state;
+ }
+ 
+ 
+@@ -2330,15 +2175,15 @@ static GLboolean check_material( GLcontext *ctx )
+ 
+ static void radeonWrapRunPipeline( GLcontext *ctx )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    GLboolean has_material;
+ 
+    if (0)
+-      fprintf(stderr, "%s, newstate: %x\n", __FUNCTION__, rmesa->NewGLState);
++      fprintf(stderr, "%s, newstate: %x\n", __FUNCTION__, rmesa->radeon.NewGLState);
+ 
+    /* Validate state:
+     */
+-   if (rmesa->NewGLState)
++   if (rmesa->radeon.NewGLState)
+       radeonValidateState( ctx );
+ 
+    has_material = (ctx->Light.Enabled && check_material( ctx ));
+diff --git a/src/mesa/drivers/dri/radeon/radeon_state.h b/src/mesa/drivers/dri/radeon/radeon_state.h
+index 2171879..17c2b11 100644
+--- a/src/mesa/drivers/dri/radeon/radeon_state.h
++++ b/src/mesa/drivers/dri/radeon/radeon_state.h
+@@ -39,22 +39,20 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ 
+ #include "radeon_context.h"
+ 
+-extern void radeonInitState( radeonContextPtr rmesa );
++extern void radeonInitState( r100ContextPtr rmesa );
+ extern void radeonInitStateFuncs( GLcontext *ctx );
+ 
+ extern void radeonUpdateMaterial( GLcontext *ctx );
+ 
+-extern void radeonSetCliprects( radeonContextPtr rmesa );
+-extern void radeonRecalcScissorRects( radeonContextPtr rmesa );
+ extern void radeonUpdateViewportOffset( GLcontext *ctx );
+ extern void radeonUpdateWindow( GLcontext *ctx );
+ extern void radeonUpdateDrawBuffer( GLcontext *ctx );
+-extern void radeonUploadTexMatrix( radeonContextPtr rmesa,
++extern void radeonUploadTexMatrix( r100ContextPtr rmesa,
+ 				   int unit, GLboolean swapcols );
+ 
+ extern void radeonValidateState( GLcontext *ctx );
+ 
+-extern void radeonPrintDirty( radeonContextPtr rmesa,
++extern void radeonPrintDirty( r100ContextPtr rmesa,
+ 			      const char *msg );
+ 
+ 
+@@ -62,7 +60,7 @@ extern void radeonFallback( GLcontext *ctx, GLuint bit, GLboolean mode );
+ #define FALLBACK( rmesa, bit, mode ) do {				\
+    if ( 0 ) fprintf( stderr, "FALLBACK in %s: #%d=%d\n",		\
+ 		     __FUNCTION__, bit, mode );				\
+-   radeonFallback( rmesa->glCtx, bit, mode );				\
++   radeonFallback( rmesa->radeon.glCtx, bit, mode );				\
+ } while (0)
+ 
+ 
+diff --git a/src/mesa/drivers/dri/radeon/radeon_state_init.c b/src/mesa/drivers/dri/radeon/radeon_state_init.c
+index 57dc380..7ff0eb4 100644
+--- a/src/mesa/drivers/dri/radeon/radeon_state_init.c
++++ b/src/mesa/drivers/dri/radeon/radeon_state_init.c
+@@ -38,39 +38,156 @@
+ #include "swrast_setup/swrast_setup.h"
+ 
+ #include "radeon_context.h"
++#include "radeon_mipmap_tree.h"
+ #include "radeon_ioctl.h"
+ #include "radeon_state.h"
+ #include "radeon_tcl.h"
+ #include "radeon_tex.h"
+ #include "radeon_swtcl.h"
+ 
++#include "../r200/r200_reg.h"
++
+ #include "xmlpool.h"
+ 
++/* New (1.3) state mechanism.  3 commands (packet, scalar, vector) in
++ * 1.3 cmdbuffers allow all previous state to be updated as well as
++ * the tcl scalar and vector areas.
 + */
-+GLboolean radeon_miptree_matches_texture(radeon_mipmap_tree *mt, struct gl_texture_object *texObj)
++static struct {
++	int start;
++	int len;
++	const char *name;
++} packet[RADEON_MAX_STATE_PACKETS] = {
++	{RADEON_PP_MISC, 7, "RADEON_PP_MISC"},
++	{RADEON_PP_CNTL, 3, "RADEON_PP_CNTL"},
++	{RADEON_RB3D_COLORPITCH, 1, "RADEON_RB3D_COLORPITCH"},
++	{RADEON_RE_LINE_PATTERN, 2, "RADEON_RE_LINE_PATTERN"},
++	{RADEON_SE_LINE_WIDTH, 1, "RADEON_SE_LINE_WIDTH"},
++	{RADEON_PP_LUM_MATRIX, 1, "RADEON_PP_LUM_MATRIX"},
++	{RADEON_PP_ROT_MATRIX_0, 2, "RADEON_PP_ROT_MATRIX_0"},
++	{RADEON_RB3D_STENCILREFMASK, 3, "RADEON_RB3D_STENCILREFMASK"},
++	{RADEON_SE_VPORT_XSCALE, 6, "RADEON_SE_VPORT_XSCALE"},
++	{RADEON_SE_CNTL, 2, "RADEON_SE_CNTL"},
++	{RADEON_SE_CNTL_STATUS, 1, "RADEON_SE_CNTL_STATUS"},
++	{RADEON_RE_MISC, 1, "RADEON_RE_MISC"},
++	{RADEON_PP_TXFILTER_0, 6, "RADEON_PP_TXFILTER_0"},
++	{RADEON_PP_BORDER_COLOR_0, 1, "RADEON_PP_BORDER_COLOR_0"},
++	{RADEON_PP_TXFILTER_1, 6, "RADEON_PP_TXFILTER_1"},
++	{RADEON_PP_BORDER_COLOR_1, 1, "RADEON_PP_BORDER_COLOR_1"},
++	{RADEON_PP_TXFILTER_2, 6, "RADEON_PP_TXFILTER_2"},
++	{RADEON_PP_BORDER_COLOR_2, 1, "RADEON_PP_BORDER_COLOR_2"},
++	{RADEON_SE_ZBIAS_FACTOR, 2, "RADEON_SE_ZBIAS_FACTOR"},
++	{RADEON_SE_TCL_OUTPUT_VTX_FMT, 11, "RADEON_SE_TCL_OUTPUT_VTX_FMT"},
++	{RADEON_SE_TCL_MATERIAL_EMMISSIVE_RED, 17,
++		    "RADEON_SE_TCL_MATERIAL_EMMISSIVE_RED"},
++	{R200_PP_TXCBLEND_0, 4, "R200_PP_TXCBLEND_0"},
++	{R200_PP_TXCBLEND_1, 4, "R200_PP_TXCBLEND_1"},
++	{R200_PP_TXCBLEND_2, 4, "R200_PP_TXCBLEND_2"},
++	{R200_PP_TXCBLEND_3, 4, "R200_PP_TXCBLEND_3"},
++	{R200_PP_TXCBLEND_4, 4, "R200_PP_TXCBLEND_4"},
++	{R200_PP_TXCBLEND_5, 4, "R200_PP_TXCBLEND_5"},
++	{R200_PP_TXCBLEND_6, 4, "R200_PP_TXCBLEND_6"},
++	{R200_PP_TXCBLEND_7, 4, "R200_PP_TXCBLEND_7"},
++	{R200_SE_TCL_LIGHT_MODEL_CTL_0, 6, "R200_SE_TCL_LIGHT_MODEL_CTL_0"},
++	{R200_PP_TFACTOR_0, 6, "R200_PP_TFACTOR_0"},
++	{R200_SE_VTX_FMT_0, 4, "R200_SE_VTX_FMT_0"},
++	{R200_SE_VAP_CNTL, 1, "R200_SE_VAP_CNTL"},
++	{R200_SE_TCL_MATRIX_SEL_0, 5, "R200_SE_TCL_MATRIX_SEL_0"},
++	{R200_SE_TCL_TEX_PROC_CTL_2, 5, "R200_SE_TCL_TEX_PROC_CTL_2"},
++	{R200_SE_TCL_UCP_VERT_BLEND_CTL, 1, "R200_SE_TCL_UCP_VERT_BLEND_CTL"},
++	{R200_PP_TXFILTER_0, 6, "R200_PP_TXFILTER_0"},
++	{R200_PP_TXFILTER_1, 6, "R200_PP_TXFILTER_1"},
++	{R200_PP_TXFILTER_2, 6, "R200_PP_TXFILTER_2"},
++	{R200_PP_TXFILTER_3, 6, "R200_PP_TXFILTER_3"},
++	{R200_PP_TXFILTER_4, 6, "R200_PP_TXFILTER_4"},
++	{R200_PP_TXFILTER_5, 6, "R200_PP_TXFILTER_5"},
++	{R200_PP_TXOFFSET_0, 1, "R200_PP_TXOFFSET_0"},
++	{R200_PP_TXOFFSET_1, 1, "R200_PP_TXOFFSET_1"},
++	{R200_PP_TXOFFSET_2, 1, "R200_PP_TXOFFSET_2"},
++	{R200_PP_TXOFFSET_3, 1, "R200_PP_TXOFFSET_3"},
++	{R200_PP_TXOFFSET_4, 1, "R200_PP_TXOFFSET_4"},
++	{R200_PP_TXOFFSET_5, 1, "R200_PP_TXOFFSET_5"},
++	{R200_SE_VTE_CNTL, 1, "R200_SE_VTE_CNTL"},
++	{R200_SE_TCL_OUTPUT_VTX_COMP_SEL, 1,
++	 "R200_SE_TCL_OUTPUT_VTX_COMP_SEL"},
++	{R200_PP_TAM_DEBUG3, 1, "R200_PP_TAM_DEBUG3"},
++	{R200_PP_CNTL_X, 1, "R200_PP_CNTL_X"},
++	{R200_RB3D_DEPTHXY_OFFSET, 1, "R200_RB3D_DEPTHXY_OFFSET"},
++	{R200_RE_AUX_SCISSOR_CNTL, 1, "R200_RE_AUX_SCISSOR_CNTL"},
++	{R200_RE_SCISSOR_TL_0, 2, "R200_RE_SCISSOR_TL_0"},
++	{R200_RE_SCISSOR_TL_1, 2, "R200_RE_SCISSOR_TL_1"},
++	{R200_RE_SCISSOR_TL_2, 2, "R200_RE_SCISSOR_TL_2"},
++	{R200_SE_VAP_CNTL_STATUS, 1, "R200_SE_VAP_CNTL_STATUS"},
++	{R200_SE_VTX_STATE_CNTL, 1, "R200_SE_VTX_STATE_CNTL"},
++	{R200_RE_POINTSIZE, 1, "R200_RE_POINTSIZE"},
++	{R200_SE_TCL_INPUT_VTX_VECTOR_ADDR_0, 4,
++		    "R200_SE_TCL_INPUT_VTX_VECTOR_ADDR_0"},
++	{R200_PP_CUBIC_FACES_0, 1, "R200_PP_CUBIC_FACES_0"},	/* 61 */
++	{R200_PP_CUBIC_OFFSET_F1_0, 5, "R200_PP_CUBIC_OFFSET_F1_0"}, /* 62 */
++	{R200_PP_CUBIC_FACES_1, 1, "R200_PP_CUBIC_FACES_1"},
++	{R200_PP_CUBIC_OFFSET_F1_1, 5, "R200_PP_CUBIC_OFFSET_F1_1"},
++	{R200_PP_CUBIC_FACES_2, 1, "R200_PP_CUBIC_FACES_2"},
++	{R200_PP_CUBIC_OFFSET_F1_2, 5, "R200_PP_CUBIC_OFFSET_F1_2"},
++	{R200_PP_CUBIC_FACES_3, 1, "R200_PP_CUBIC_FACES_3"},
++	{R200_PP_CUBIC_OFFSET_F1_3, 5, "R200_PP_CUBIC_OFFSET_F1_3"},
++	{R200_PP_CUBIC_FACES_4, 1, "R200_PP_CUBIC_FACES_4"},
++	{R200_PP_CUBIC_OFFSET_F1_4, 5, "R200_PP_CUBIC_OFFSET_F1_4"},
++	{R200_PP_CUBIC_FACES_5, 1, "R200_PP_CUBIC_FACES_5"},
++	{R200_PP_CUBIC_OFFSET_F1_5, 5, "R200_PP_CUBIC_OFFSET_F1_5"},
++	{RADEON_PP_TEX_SIZE_0, 2, "RADEON_PP_TEX_SIZE_0"},
++	{RADEON_PP_TEX_SIZE_1, 2, "RADEON_PP_TEX_SIZE_1"},
++	{RADEON_PP_TEX_SIZE_2, 2, "RADEON_PP_TEX_SIZE_2"},
++	{R200_RB3D_BLENDCOLOR, 3, "R200_RB3D_BLENDCOLOR"},
++	{R200_SE_TCL_POINT_SPRITE_CNTL, 1, "R200_SE_TCL_POINT_SPRITE_CNTL"},
++	{RADEON_PP_CUBIC_FACES_0, 1, "RADEON_PP_CUBIC_FACES_0"},
++	{RADEON_PP_CUBIC_OFFSET_T0_0, 5, "RADEON_PP_CUBIC_OFFSET_T0_0"},
++	{RADEON_PP_CUBIC_FACES_1, 1, "RADEON_PP_CUBIC_FACES_1"},
++	{RADEON_PP_CUBIC_OFFSET_T1_0, 5, "RADEON_PP_CUBIC_OFFSET_T1_0"},
++	{RADEON_PP_CUBIC_FACES_2, 1, "RADEON_PP_CUBIC_FACES_2"},
++	{RADEON_PP_CUBIC_OFFSET_T2_0, 5, "RADEON_PP_CUBIC_OFFSET_T2_0"},
++	{R200_PP_TRI_PERF, 2, "R200_PP_TRI_PERF"},
++	{R200_PP_TXCBLEND_8, 32, "R200_PP_AFS_0"},     /* 85 */
++	{R200_PP_TXCBLEND_0, 32, "R200_PP_AFS_1"},
++	{R200_PP_TFACTOR_0, 8, "R200_ATF_TFACTOR"},
++	{R200_PP_TXFILTER_0, 8, "R200_PP_TXCTLALL_0"},
++	{R200_PP_TXFILTER_1, 8, "R200_PP_TXCTLALL_1"},
++	{R200_PP_TXFILTER_2, 8, "R200_PP_TXCTLALL_2"},
++	{R200_PP_TXFILTER_3, 8, "R200_PP_TXCTLALL_3"},
++	{R200_PP_TXFILTER_4, 8, "R200_PP_TXCTLALL_4"},
++	{R200_PP_TXFILTER_5, 8, "R200_PP_TXCTLALL_5"},
++	{R200_VAP_PVS_CNTL_1, 2, "R200_VAP_PVS_CNTL"},
++};
++
+ /* =============================================================
+  * State initialization
+  */
+ 
+-void radeonPrintDirty( radeonContextPtr rmesa, const char *msg )
++void radeonPrintDirty( r100ContextPtr rmesa, const char *msg )
+ {
+    struct radeon_state_atom *l;
+ 
+    fprintf(stderr, msg);
+    fprintf(stderr, ": ");
+ 
+-   foreach(l, &rmesa->hw.atomlist) {
+-      if (l->dirty || rmesa->hw.all_dirty)
++   foreach(l, &rmesa->radeon.hw.atomlist) {
++      if (l->dirty || rmesa->radeon.hw.all_dirty)
+ 	 fprintf(stderr, "%s, ", l->name);
+    }
+ 
+    fprintf(stderr, "\n");
+ }
+ 
+-static int cmdpkt( int id ) 
++static int cmdpkt( r100ContextPtr rmesa, int id ) 
+ {
+    drm_radeon_cmd_header_t h;
+-   h.i = 0;
+-   h.packet.cmd_type = RADEON_CMD_PACKET;
+-   h.packet.packet_id = id;
++
++   if (rmesa->radeon.radeonScreen->kernel_mm) {
++     return CP_PACKET0(packet[id].start, packet[id].len - 1);
++   } else {
++     h.i = 0;
++     h.packet.cmd_type = RADEON_CMD_PACKET;
++     h.packet.packet_id = id;
++   }
+    return h.i;
+ }
+ 
+@@ -96,17 +213,17 @@ static int cmdscl( int offset, int stride, int count )
+    return h.i;
+ }
+ 
+-#define CHECK( NM, FLAG )			\
+-static GLboolean check_##NM( GLcontext *ctx )	\
+-{						\
+-   return FLAG;					\
++#define CHECK( NM, FLAG )				\
++static int check_##NM( GLcontext *ctx, struct radeon_state_atom *atom )	\
++{							\
++   return FLAG ? atom->cmd_size : 0;			\
+ }
+ 
+ #define TCL_CHECK( NM, FLAG )				\
+-static GLboolean check_##NM( GLcontext *ctx )		\
++static int check_##NM( GLcontext *ctx, struct radeon_state_atom *atom )	\
+ {							\
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);	\
+-   return !rmesa->TclFallback && (FLAG);		\
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);	\
++   return (!rmesa->radeon.TclFallback && (FLAG)) ? atom->cmd_size : 0;	\
+ }
+ 
+ 
+@@ -146,42 +263,290 @@ CHECK( txr0, (ctx->Texture.Unit[0]._ReallyEnabled & TEXTURE_RECT_BIT))
+ CHECK( txr1, (ctx->Texture.Unit[1]._ReallyEnabled & TEXTURE_RECT_BIT))
+ CHECK( txr2, (ctx->Texture.Unit[2]._ReallyEnabled & TEXTURE_RECT_BIT))
+ 
++#define OUT_VEC(hdr, data) do {			\
++    drm_radeon_cmd_header_t h;					\
++    h.i = hdr;								\
++    OUT_BATCH(CP_PACKET0(RADEON_SE_TCL_STATE_FLUSH, 0));		\
++    OUT_BATCH(0);							\
++    OUT_BATCH(CP_PACKET0(R200_SE_TCL_VECTOR_INDX_REG, 0));		\
++    OUT_BATCH(h.vectors.offset | (h.vectors.stride << RADEON_VEC_INDX_OCTWORD_STRIDE_SHIFT)); \
++    OUT_BATCH(CP_PACKET0_ONE(R200_SE_TCL_VECTOR_DATA_REG, h.vectors.count - 1));	\
++    OUT_BATCH_TABLE((data), h.vectors.count);				\
++  } while(0)
++
++#define OUT_SCL(hdr, data) do {					\
++    drm_radeon_cmd_header_t h;						\
++    h.i = hdr;								\
++    OUT_BATCH(CP_PACKET0(R200_SE_TCL_SCALAR_INDX_REG, 0));		\
++    OUT_BATCH((h.scalars.offset) | (h.scalars.stride << RADEON_SCAL_INDX_DWORD_STRIDE_SHIFT)); \
++    OUT_BATCH(CP_PACKET0_ONE(R200_SE_TCL_SCALAR_DATA_REG, h.scalars.count - 1));	\
++    OUT_BATCH_TABLE((data), h.scalars.count);				\
++  } while(0)
++
++static void scl_emit(GLcontext *ctx, struct radeon_state_atom *atom)
 +{
-+	struct gl_texture_image *firstImage;
-+	GLuint compressed;
-+	GLuint numfaces = 1;
-+	GLuint firstLevel, lastLevel;
++   r100ContextPtr r100 = R100_CONTEXT(ctx);
++   BATCH_LOCALS(&r100->radeon);
++   uint32_t dwords = atom->cmd_size;
++   /* OUT_SCL emits 3 header dwords in place of cmd[0], hence +2 (assumes cmd_size == 1 + count) */
++   dwords += 2;
++   BEGIN_BATCH_NO_AUTOSTATE(dwords);
++   OUT_SCL(atom->cmd[0], atom->cmd+1);
++   END_BATCH();
++}
+ 
+ 
+-/* Initialize the context's hardware state.
+- */
+-void radeonInitState( radeonContextPtr rmesa )
++static void vec_emit(GLcontext *ctx, struct radeon_state_atom *atom)
+ {
+-   GLcontext *ctx = rmesa->glCtx;
+-   GLuint color_fmt, depth_fmt, i;
+-   GLint drawPitch, drawOffset;
++   r100ContextPtr r100 = R100_CONTEXT(ctx);
++   BATCH_LOCALS(&r100->radeon);
++   uint32_t dwords = atom->cmd_size;
++   /* OUT_VEC emits 5 header dwords in place of cmd[0], hence +4 (assumes cmd_size == 1 + count) */
++   dwords += 4;
++   BEGIN_BATCH_NO_AUTOSTATE(dwords);
++   OUT_VEC(atom->cmd[0], atom->cmd+1);
++   END_BATCH();
++}
+ 
+-   switch ( rmesa->radeonScreen->cpp ) {
+-   case 2:
+-      color_fmt = RADEON_COLOR_FORMAT_RGB565;
+-      break;
+-   case 4:
+-      color_fmt = RADEON_COLOR_FORMAT_ARGB8888;
+-      break;
+-   default:
+-      fprintf( stderr, "Error: Unsupported pixel depth... exiting\n" );
+-      exit( -1 );
++
++static void lit_emit(GLcontext *ctx, struct radeon_state_atom *atom)
++{
++   r100ContextPtr r100 = R100_CONTEXT(ctx);
++   BATCH_LOCALS(&r100->radeon);
++   uint32_t dwords = atom->cmd_size;
++   /* one OUT_VEC (5 hdr dwords) + one OUT_SCL (3) replace the two cmd headers: +6 */
++   dwords += 6;
++   BEGIN_BATCH_NO_AUTOSTATE(dwords);
++   OUT_VEC(atom->cmd[LIT_CMD_0], atom->cmd+1);   /* cmd+1: presumably LIT_CMD_0 == 0 -- confirm */
++   OUT_SCL(atom->cmd[LIT_CMD_1], atom->cmd+LIT_CMD_1+1);
++   END_BATCH();
++}
 +
-+	calculate_first_last_level(texObj, &firstLevel, &lastLevel, 0, texObj->BaseLevel);
-+	if (texObj->Target == GL_TEXTURE_CUBE_MAP)
-+		numfaces = 6;
++static void ctx_emit(GLcontext *ctx, struct radeon_state_atom *atom)
++{
++   r100ContextPtr r100 = R100_CONTEXT(ctx);
++   BATCH_LOCALS(&r100->radeon);
++   struct radeon_renderbuffer *rrb;
++   uint32_t cbpitch;
++   uint32_t zbpitch, depth_fmt;
++   uint32_t dwords = atom->cmd_size;
++   /* non-kernel_mm variant; the +4 below presumably pays for the two relocs -- TODO confirm */
++   /* output the first 5 entries of the context atom unchanged */
++   BEGIN_BATCH_NO_AUTOSTATE(dwords + 4);
++   OUT_BATCH_TABLE(atom->cmd, 5);
++
++   rrb = radeon_get_depthbuffer(&r100->radeon);
++   if (!rrb) {
++     OUT_BATCH(0);
++     OUT_BATCH(0);
++   } else {
++     zbpitch = (rrb->pitch / rrb->cpp);
++     if (r100->using_hyperz)
++       zbpitch |= RADEON_DEPTH_HYPERZ;
++
++     OUT_BATCH_RELOC(0, rrb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
++     OUT_BATCH(zbpitch);
++     if (rrb->cpp == 4)
++        depth_fmt = RADEON_DEPTH_FORMAT_24BIT_INT_Z;
++     else
++        depth_fmt = RADEON_DEPTH_FORMAT_16BIT_INT_Z;
++     atom->cmd[CTX_RB3D_ZSTENCILCNTL] &= ~RADEON_DEPTH_FORMAT_MASK;
++     atom->cmd[CTX_RB3D_ZSTENCILCNTL] |= depth_fmt;
++   }
++     
++   OUT_BATCH(atom->cmd[CTX_RB3D_ZSTENCILCNTL]);
++   OUT_BATCH(atom->cmd[CTX_CMD_1]);
++   OUT_BATCH(atom->cmd[CTX_PP_CNTL]);
++
++   rrb = radeon_get_colorbuffer(&r100->radeon);
++   if (!rrb || !rrb->bo) {
++      OUT_BATCH(atom->cmd[CTX_RB3D_CNTL]);
++      OUT_BATCH(atom->cmd[CTX_RB3D_COLOROFFSET]);
++   } else {
++      atom->cmd[CTX_RB3D_CNTL] &= ~(0xf << 10);
++      if (rrb->cpp == 4)
++         atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB8888;
++      else
++         atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_RGB565;
++
++      OUT_BATCH(atom->cmd[CTX_RB3D_CNTL]);
++      OUT_BATCH_RELOC(0, rrb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
++   }
 +
-+	firstImage = texObj->Image[0][firstLevel];
-+	compressed = firstImage->IsCompressed ? firstImage->TexFormat->MesaFormat : 0;
++   OUT_BATCH(atom->cmd[CTX_CMD_2]);
 +
-+	return (mt->firstLevel == firstLevel &&
-+	        mt->lastLevel == lastLevel &&
-+	        mt->width0 == firstImage->Width &&
-+	        mt->height0 == firstImage->Height &&
-+	        mt->depth0 == firstImage->Depth &&
-+	        mt->bpp == firstImage->TexFormat->TexelBytes &&
-+	        mt->compressed == compressed);
-+}
++   if (!rrb || !rrb->bo) {
++     OUT_BATCH(atom->cmd[CTX_RB3D_COLORPITCH]);
++   } else {
++     cbpitch = (rrb->pitch / rrb->cpp);
++     if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE)
++       cbpitch |= RADEON_COLOR_TILE_ENABLE;
++     OUT_BATCH(cbpitch);
++   }
 +
++   END_BATCH();
++}
 +
-+/**
-+ * Try to allocate a mipmap tree for the given texture that will fit the
-+ * given image in the given position.
-+ */
-+void radeon_try_alloc_miptree(radeonContextPtr rmesa, radeonTexObj *t,
-+		struct gl_texture_image *texImage, GLuint face, GLuint level)
++static void ctx_emit_cs(GLcontext *ctx, struct radeon_state_atom *atom)
 +{
-+	GLuint compressed = texImage->IsCompressed ? texImage->TexFormat->MesaFormat : 0;
-+	GLuint numfaces = 1;
-+	GLuint firstLevel, lastLevel;
++   r100ContextPtr r100 = R100_CONTEXT(ctx);
++   BATCH_LOCALS(&r100->radeon);
++   struct radeon_renderbuffer *rrb, *drb;
++   uint32_t cbpitch = 0;
++   uint32_t zbpitch = 0;
++   uint32_t dwords = atom->cmd_size;
++   uint32_t depth_fmt;
++   /* kernel_mm (CS) variant of ctx_emit: each register group gets its own PACKET0 */
++   rrb = radeon_get_colorbuffer(&r100->radeon);
++   if (!rrb || !rrb->bo) {
++      fprintf(stderr, "no rrb\n");
++      return;
++   }
 +
-+	assert(!t->mt);
++   atom->cmd[CTX_RB3D_CNTL] &= ~(0xf << 10);
++   if (rrb->cpp == 4)
++	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB8888;
++   else
++	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_RGB565;
++
++   cbpitch = (rrb->pitch / rrb->cpp);
++   if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE)
++       cbpitch |= R200_COLOR_TILE_ENABLE;
++
++   drb = radeon_get_depthbuffer(&r100->radeon);
++   if (drb) {
++     zbpitch = (drb->pitch / drb->cpp);
++     if (drb->cpp == 4)
++        depth_fmt = RADEON_DEPTH_FORMAT_24BIT_INT_Z;
++     else
++        depth_fmt = RADEON_DEPTH_FORMAT_16BIT_INT_Z;
++     atom->cmd[CTX_RB3D_ZSTENCILCNTL] &= ~RADEON_DEPTH_FORMAT_MASK;
++     atom->cmd[CTX_RB3D_ZSTENCILCNTL] |= depth_fmt;
++     
++   }
 +
-+	calculate_first_last_level(&t->base, &firstLevel, &lastLevel, face, level);
-+	if (t->base.Target == GL_TEXTURE_CUBE_MAP)
-+		numfaces = 6;
++   /* reserve 4 extra dwords per buffer for its separate offset/pitch packets */
++   if (drb)
++     dwords += 4;
++   if (rrb)
++     dwords += 4;
++   BEGIN_BATCH_NO_AUTOSTATE(dwords);
 +
-+	if (level != firstLevel || face >= numfaces)
-+		return;
++   /* In the CS case we need to split this up */
++   OUT_BATCH(CP_PACKET0(packet[0].start, 3));
++   OUT_BATCH_TABLE((atom->cmd + 1), 4);
 +
-+	t->mt = radeon_miptree_create(rmesa, t, t->base.Target,
-+		firstLevel, lastLevel,
-+		texImage->Width, texImage->Height, texImage->Depth,
-+		texImage->TexFormat->TexelBytes, t->tile_bits, compressed);
-+}
-diff --git a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.h b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.h
-new file mode 100644
-index 0000000..43dfa48
---- /dev/null
-+++ b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.h
-@@ -0,0 +1,97 @@
-+/*
-+ * Copyright (C) 2008 Nicolai Haehnle.
-+ *
-+ * All Rights Reserved.
-+ *
-+ * Permission is hereby granted, free of charge, to any person obtaining
-+ * a copy of this software and associated documentation files (the
-+ * "Software"), to deal in the Software without restriction, including
-+ * without limitation the rights to use, copy, modify, merge, publish,
-+ * distribute, sublicense, and/or sell copies of the Software, and to
-+ * permit persons to whom the Software is furnished to do so, subject to
-+ * the following conditions:
-+ *
-+ * The above copyright notice and this permission notice (including the
-+ * next paragraph) shall be included in all copies or substantial
-+ * portions of the Software.
-+ *
-+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
-+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-+ *
-+ */
++   if (drb) {
++     OUT_BATCH(CP_PACKET0(RADEON_RB3D_DEPTHOFFSET, 0));
++     OUT_BATCH_RELOC(0, drb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0); /* depth bo, not the color bo */
 +
-+#ifndef __RADEON_MIPMAP_TREE_H_
-+#define __RADEON_MIPMAP_TREE_H_
++     OUT_BATCH(CP_PACKET0(RADEON_RB3D_DEPTHPITCH, 0));
++     OUT_BATCH(zbpitch);
++   }
 +
-+#include "radeon_common.h"
++   OUT_BATCH(CP_PACKET0(RADEON_RB3D_ZSTENCILCNTL, 0));
++   OUT_BATCH(atom->cmd[CTX_RB3D_ZSTENCILCNTL]);
++   OUT_BATCH(CP_PACKET0(RADEON_PP_CNTL, 1));
++   OUT_BATCH(atom->cmd[CTX_PP_CNTL]);
++   OUT_BATCH(atom->cmd[CTX_RB3D_CNTL]);
++
++   if (rrb) {
++     OUT_BATCH(CP_PACKET0(RADEON_RB3D_COLOROFFSET, 0));
++     OUT_BATCH_RELOC(0, rrb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+    }
+ 
+-   rmesa->state.color.clear = 0x00000000;
++   if (rrb) {
++     OUT_BATCH(CP_PACKET0(RADEON_RB3D_COLORPITCH, 0));
++     OUT_BATCH(cbpitch);
++   }
 +
-+typedef struct _radeon_mipmap_tree radeon_mipmap_tree;
-+typedef struct _radeon_mipmap_level radeon_mipmap_level;
-+typedef struct _radeon_mipmap_image radeon_mipmap_image;
++   // if (atom->cmd_size == CTX_STATE_SIZE_NEWDRM) {
++   //   OUT_BATCH_TABLE((atom->cmd + 14), 4);
++   // }
 +
-+struct _radeon_mipmap_image {
-+	GLuint offset; /** Offset of this image from the start of mipmap tree buffer, in bytes */
-+};
++   END_BATCH();
++}
 +
-+struct _radeon_mipmap_level {
-+	GLuint width;
-+	GLuint height;
-+	GLuint depth;
-+	GLuint size; /** Size of each image, in bytes */
-+	GLuint rowstride; /** in bytes */
-+	radeon_mipmap_image faces[6];
-+};
++static void cube_emit(GLcontext *ctx, struct radeon_state_atom *atom)
++{
++   r100ContextPtr r100 = R100_CONTEXT(ctx);
++   BATCH_LOCALS(&r100->radeon);
++   uint32_t dwords = atom->cmd_size;
++   int i = atom->idx, j;
++   radeonTexObj *t = r100->state.texture.unit[i].texobj;
++   radeon_mipmap_level *lvl;
++   /* Emits cmd[0..2] plus relocs for level-0 faces 0-4 of unit atom->idx; emits nothing otherwise. */
++   if (!(ctx->Texture.Unit[i]._ReallyEnabled & TEXTURE_CUBE_BIT))
++	return;
++
++   if (!t)
++	return;
++
++   if (!t->mt)
++	return;
++
++   BEGIN_BATCH_NO_AUTOSTATE(dwords + 10);
++   OUT_BATCH_TABLE(atom->cmd, 3);
++   lvl = &t->mt->levels[0];
++   for (j = 0; j < 5; j++) {   /* faces[5] is emitted by tex_emit for cube textures */
++	OUT_BATCH_RELOC(lvl->faces[j].offset, t->mt->bo, lvl->faces[j].offset,
++			RADEON_GEM_DOMAIN_VRAM, 0, 0);
++   }
++   END_BATCH();
++}
 +
++static void tex_emit(GLcontext *ctx, struct radeon_state_atom *atom)
++{
++   r100ContextPtr r100 = R100_CONTEXT(ctx);
++   BATCH_LOCALS(&r100->radeon);
++   uint32_t dwords = atom->cmd_size;
++   int i = atom->idx;
++   radeonTexObj *t = r100->state.texture.unit[i].texobj;
++   radeon_mipmap_level *lvl;
++   /* Emits the TEX atom for unit atom->idx, substituting the offset dword (cmd[3]). */
++   if (t && t->mt && !t->image_override)
++     dwords += 2;
++   BEGIN_BATCH_NO_AUTOSTATE(dwords);
++   OUT_BATCH_TABLE(atom->cmd, 3);
++   if (t && t->mt && !t->image_override) {
++     if ((ctx->Texture.Unit[i]._ReallyEnabled & TEXTURE_CUBE_BIT)) {
++   	lvl = &t->mt->levels[0];
++	OUT_BATCH_RELOC(lvl->faces[5].offset, t->mt->bo, lvl->faces[5].offset,
++			RADEON_GEM_DOMAIN_VRAM, 0, 0);
++     } else {
++        OUT_BATCH_RELOC(t->tile_bits, t->mt->bo, 0,
++		     RADEON_GEM_DOMAIN_VRAM, 0, 0);
++     }
++   } else if (!t) {
++     /* workaround for old CS mechanism */
++     OUT_BATCH(r100->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP]);
++     //     OUT_BATCH(r100->radeon.radeonScreen);
++   } else   /* override, or no miptree yet: always emit the offset dword so the batch size matches */
++     OUT_BATCH(t->override_offset);
++
++   OUT_BATCH_TABLE((atom->cmd+4), 5);
++   END_BATCH();
++}
 +
-+/**
-+ * A mipmap tree contains texture images in the layout that the hardware
-+ * expects.
-+ *
-+ * The meta-data of mipmap trees is immutable, i.e. you cannot change the
-+ * layout on-the-fly; however, the texture contents (i.e. texels) can be
-+ * changed.
++/* Initialize the context's hardware state.
 + */
-+struct _radeon_mipmap_tree {
-+	radeonContextPtr radeon;
-+	radeonTexObj *t;
-+	struct radeon_bo *bo;
-+	GLuint refcount;
++void radeonInitState( r100ContextPtr rmesa )
++{
++   GLcontext *ctx = rmesa->radeon.glCtx;
++   GLuint i;
++
++   rmesa->radeon.state.color.clear = 0x00000000;
+ 
+    switch ( ctx->Visual.depthBits ) {
+    case 16:
+-      rmesa->state.depth.clear = 0x0000ffff;
+-      rmesa->state.depth.scale = 1.0 / (GLfloat)0xffff;
+-      depth_fmt = RADEON_DEPTH_FORMAT_16BIT_INT_Z;
+-      rmesa->state.stencil.clear = 0x00000000;
++      rmesa->radeon.state.depth.clear = 0x0000ffff;
++      rmesa->radeon.state.depth.scale = 1.0 / (GLfloat)0xffff;
++      rmesa->radeon.state.stencil.clear = 0x00000000;
+       break;
+    case 24:
+-      rmesa->state.depth.clear = 0x00ffffff;
+-      rmesa->state.depth.scale = 1.0 / (GLfloat)0xffffff;
+-      depth_fmt = RADEON_DEPTH_FORMAT_24BIT_INT_Z;
+-      rmesa->state.stencil.clear = 0xffff0000;
++      rmesa->radeon.state.depth.clear = 0x00ffffff;
++      rmesa->radeon.state.depth.scale = 1.0 / (GLfloat)0xffffff;
++      rmesa->radeon.state.stencil.clear = 0xffff0000;
+       break;
+    default:
+       fprintf( stderr, "Error: Unsupported depth %d... exiting\n",
+@@ -190,37 +555,37 @@ void radeonInitState( radeonContextPtr rmesa )
+    }
+ 
+    /* Only have hw stencil when depth buffer is 24 bits deep */
+-   rmesa->state.stencil.hwBuffer = ( ctx->Visual.stencilBits > 0 &&
++   rmesa->radeon.state.stencil.hwBuffer = ( ctx->Visual.stencilBits > 0 &&
+ 				     ctx->Visual.depthBits == 24 );
+ 
+-   rmesa->Fallback = 0;
++   rmesa->radeon.Fallback = 0;
+ 
+-   if ( ctx->Visual.doubleBufferMode && rmesa->sarea->pfCurrentPage == 0 ) {
+-      drawOffset = rmesa->radeonScreen->backOffset;
+-      drawPitch  = rmesa->radeonScreen->backPitch;
+-   } else {
+-      drawOffset = rmesa->radeonScreen->frontOffset;
+-      drawPitch  = rmesa->radeonScreen->frontPitch;
+-   }
+ 
+-   rmesa->hw.max_state_size = 0;
++   rmesa->radeon.hw.max_state_size = 0;
+ 
+-#define ALLOC_STATE( ATOM, CHK, SZ, NM, FLAG )				\
++#define ALLOC_STATE_IDX( ATOM, CHK, SZ, NM, FLAG, IDX )		\
+    do {								\
+       rmesa->hw.ATOM.cmd_size = SZ;				\
+-      rmesa->hw.ATOM.cmd = (int *)CALLOC(SZ * sizeof(int));	\
+-      rmesa->hw.ATOM.lastcmd = (int *)CALLOC(SZ * sizeof(int));	\
+-      rmesa->hw.ATOM.name = NM;					\
++      rmesa->hw.ATOM.cmd = (GLuint *)CALLOC(SZ * sizeof(int));	\
++      rmesa->hw.ATOM.lastcmd = (GLuint *)CALLOC(SZ * sizeof(int)); \
++      rmesa->hw.ATOM.name = NM;						\
+       rmesa->hw.ATOM.is_tcl = FLAG;					\
+       rmesa->hw.ATOM.check = check_##CHK;				\
+-      rmesa->hw.ATOM.dirty = GL_TRUE;				\
+-      rmesa->hw.max_state_size += SZ * sizeof(int);		\
++      rmesa->hw.ATOM.dirty = GL_TRUE;					\
++      rmesa->hw.ATOM.idx = IDX;					\
++      rmesa->radeon.hw.max_state_size += SZ * sizeof(int);		\
+    } while (0)
+-      
+-      
++
++#define ALLOC_STATE( ATOM, CHK, SZ, NM, FLAG )		\
++   ALLOC_STATE_IDX(ATOM, CHK, SZ, NM, FLAG, 0)
++
+    /* Allocate state buffers:
+     */
+    ALLOC_STATE( ctx, always, CTX_STATE_SIZE, "CTX/context", 0 );
++   if (rmesa->radeon.radeonScreen->kernel_mm)
++     rmesa->hw.ctx.emit = ctx_emit_cs;
++   else
++     rmesa->hw.ctx.emit = ctx_emit;
+    ALLOC_STATE( lin, always, LIN_STATE_SIZE, "LIN/line", 0 );
+    ALLOC_STATE( msk, always, MSK_STATE_SIZE, "MSK/mask", 0 );
+    ALLOC_STATE( vpt, always, VPT_STATE_SIZE, "VPT/viewport", 0 );
+@@ -233,20 +598,25 @@ void radeonInitState( radeonContextPtr rmesa )
+    ALLOC_STATE( fog, fog, FOG_STATE_SIZE, "FOG/fog", 1 );
+    ALLOC_STATE( glt, tcl_lighting, GLT_STATE_SIZE, "GLT/light-global", 1 );
+    ALLOC_STATE( eye, tcl_lighting, EYE_STATE_SIZE, "EYE/eye-vector", 1 );
+-   ALLOC_STATE( tex[0], tex0, TEX_STATE_SIZE, "TEX/tex-0", 0 );
+-   ALLOC_STATE( tex[1], tex1, TEX_STATE_SIZE, "TEX/tex-1", 0 );
+-   ALLOC_STATE( tex[2], tex2, TEX_STATE_SIZE, "TEX/tex-2", 0 );
+-   if (rmesa->radeonScreen->drmSupportsCubeMapsR100)
++   ALLOC_STATE_IDX( tex[0], tex0, TEX_STATE_SIZE, "TEX/tex-0", 0, 0);
++   ALLOC_STATE_IDX( tex[1], tex1, TEX_STATE_SIZE, "TEX/tex-1", 0, 1);
++   ALLOC_STATE_IDX( tex[2], tex2, TEX_STATE_SIZE, "TEX/tex-2", 0, 2 );
++
++   for (i = 0; i < 3; i++)
++     rmesa->hw.tex[i].emit = tex_emit;
++   if (rmesa->radeon.radeonScreen->drmSupportsCubeMapsR100)
+    {
+-      ALLOC_STATE( cube[0], cube0, CUBE_STATE_SIZE, "CUBE/cube-0", 0 );
+-      ALLOC_STATE( cube[1], cube1, CUBE_STATE_SIZE, "CUBE/cube-1", 0 );
+-      ALLOC_STATE( cube[2], cube2, CUBE_STATE_SIZE, "CUBE/cube-2", 0 );
++      ALLOC_STATE_IDX( cube[0], cube0, CUBE_STATE_SIZE, "CUBE/cube-0", 0, 0 );
++      ALLOC_STATE_IDX( cube[1], cube1, CUBE_STATE_SIZE, "CUBE/cube-1", 0, 1 );
++      ALLOC_STATE_IDX( cube[2], cube2, CUBE_STATE_SIZE, "CUBE/cube-2", 0, 2 );
++      for (i = 0; i < 3; i++)
++         rmesa->hw.cube[i].emit = cube_emit;
+    }
+    else
+    {
+-      ALLOC_STATE( cube[0], never, CUBE_STATE_SIZE, "CUBE/cube-0", 0 );
+-      ALLOC_STATE( cube[1], never, CUBE_STATE_SIZE, "CUBE/cube-1", 0 );
+-      ALLOC_STATE( cube[2], never, CUBE_STATE_SIZE, "CUBE/cube-2", 0 );
++      ALLOC_STATE_IDX( cube[0], never, CUBE_STATE_SIZE, "CUBE/cube-0", 0, 0 );
++      ALLOC_STATE_IDX( cube[1], never, CUBE_STATE_SIZE, "CUBE/cube-1", 0, 1 );
++      ALLOC_STATE_IDX( cube[2], never, CUBE_STATE_SIZE, "CUBE/cube-2", 0, 2 );
+    }
+    ALLOC_STATE( mat[0], tcl, MAT_STATE_SIZE, "MAT/modelproject", 1 );
+    ALLOC_STATE( mat[1], tcl_eyespace_or_fog, MAT_STATE_SIZE, "MAT/modelview", 1 );
+@@ -268,43 +638,43 @@ void radeonInitState( radeonContextPtr rmesa )
+    ALLOC_STATE( lit[5], tcl_lit5, LIT_STATE_SIZE, "LIT/light-5", 1 );
+    ALLOC_STATE( lit[6], tcl_lit6, LIT_STATE_SIZE, "LIT/light-6", 1 );
+    ALLOC_STATE( lit[7], tcl_lit7, LIT_STATE_SIZE, "LIT/light-7", 1 );
+-   ALLOC_STATE( txr[0], txr0, TXR_STATE_SIZE, "TXR/txr-0", 0 );
+-   ALLOC_STATE( txr[1], txr1, TXR_STATE_SIZE, "TXR/txr-1", 0 );
+-   ALLOC_STATE( txr[2], txr2, TXR_STATE_SIZE, "TXR/txr-2", 0 );
++   ALLOC_STATE_IDX( txr[0], txr0, TXR_STATE_SIZE, "TXR/txr-0", 0, 0 );
++   ALLOC_STATE_IDX( txr[1], txr1, TXR_STATE_SIZE, "TXR/txr-1", 0, 1 );
++   ALLOC_STATE_IDX( txr[2], txr2, TXR_STATE_SIZE, "TXR/txr-2", 0, 2 );
+ 
+    radeonSetUpAtomList( rmesa );
+ 
+    /* Fill in the packet headers:
+     */
+-   rmesa->hw.ctx.cmd[CTX_CMD_0] = cmdpkt(RADEON_EMIT_PP_MISC);
+-   rmesa->hw.ctx.cmd[CTX_CMD_1] = cmdpkt(RADEON_EMIT_PP_CNTL);
+-   rmesa->hw.ctx.cmd[CTX_CMD_2] = cmdpkt(RADEON_EMIT_RB3D_COLORPITCH);
+-   rmesa->hw.lin.cmd[LIN_CMD_0] = cmdpkt(RADEON_EMIT_RE_LINE_PATTERN);
+-   rmesa->hw.lin.cmd[LIN_CMD_1] = cmdpkt(RADEON_EMIT_SE_LINE_WIDTH);
+-   rmesa->hw.msk.cmd[MSK_CMD_0] = cmdpkt(RADEON_EMIT_RB3D_STENCILREFMASK);
+-   rmesa->hw.vpt.cmd[VPT_CMD_0] = cmdpkt(RADEON_EMIT_SE_VPORT_XSCALE);
+-   rmesa->hw.set.cmd[SET_CMD_0] = cmdpkt(RADEON_EMIT_SE_CNTL);
+-   rmesa->hw.set.cmd[SET_CMD_1] = cmdpkt(RADEON_EMIT_SE_CNTL_STATUS);
+-   rmesa->hw.msc.cmd[MSC_CMD_0] = cmdpkt(RADEON_EMIT_RE_MISC);
+-   rmesa->hw.tex[0].cmd[TEX_CMD_0] = cmdpkt(RADEON_EMIT_PP_TXFILTER_0);
+-   rmesa->hw.tex[0].cmd[TEX_CMD_1] = cmdpkt(RADEON_EMIT_PP_BORDER_COLOR_0);
+-   rmesa->hw.tex[1].cmd[TEX_CMD_0] = cmdpkt(RADEON_EMIT_PP_TXFILTER_1);
+-   rmesa->hw.tex[1].cmd[TEX_CMD_1] = cmdpkt(RADEON_EMIT_PP_BORDER_COLOR_1);
+-   rmesa->hw.tex[2].cmd[TEX_CMD_0] = cmdpkt(RADEON_EMIT_PP_TXFILTER_2);
+-   rmesa->hw.tex[2].cmd[TEX_CMD_1] = cmdpkt(RADEON_EMIT_PP_BORDER_COLOR_2);
+-   rmesa->hw.cube[0].cmd[CUBE_CMD_0] = cmdpkt(RADEON_EMIT_PP_CUBIC_FACES_0);
+-   rmesa->hw.cube[0].cmd[CUBE_CMD_1] = cmdpkt(RADEON_EMIT_PP_CUBIC_OFFSETS_T0);
+-   rmesa->hw.cube[1].cmd[CUBE_CMD_0] = cmdpkt(RADEON_EMIT_PP_CUBIC_FACES_1);
+-   rmesa->hw.cube[1].cmd[CUBE_CMD_1] = cmdpkt(RADEON_EMIT_PP_CUBIC_OFFSETS_T1);
+-   rmesa->hw.cube[2].cmd[CUBE_CMD_0] = cmdpkt(RADEON_EMIT_PP_CUBIC_FACES_2);
+-   rmesa->hw.cube[2].cmd[CUBE_CMD_1] = cmdpkt(RADEON_EMIT_PP_CUBIC_OFFSETS_T2);
+-   rmesa->hw.zbs.cmd[ZBS_CMD_0] = cmdpkt(RADEON_EMIT_SE_ZBIAS_FACTOR);
+-   rmesa->hw.tcl.cmd[TCL_CMD_0] = cmdpkt(RADEON_EMIT_SE_TCL_OUTPUT_VTX_FMT);
++   rmesa->hw.ctx.cmd[CTX_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_PP_MISC);
++   rmesa->hw.ctx.cmd[CTX_CMD_1] = cmdpkt(rmesa, RADEON_EMIT_PP_CNTL);
++   rmesa->hw.ctx.cmd[CTX_CMD_2] = cmdpkt(rmesa, RADEON_EMIT_RB3D_COLORPITCH);
++   rmesa->hw.lin.cmd[LIN_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_RE_LINE_PATTERN);
++   rmesa->hw.lin.cmd[LIN_CMD_1] = cmdpkt(rmesa, RADEON_EMIT_SE_LINE_WIDTH);
++   rmesa->hw.msk.cmd[MSK_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_RB3D_STENCILREFMASK);
++   rmesa->hw.vpt.cmd[VPT_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_SE_VPORT_XSCALE);
++   rmesa->hw.set.cmd[SET_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_SE_CNTL);
++   rmesa->hw.set.cmd[SET_CMD_1] = cmdpkt(rmesa, RADEON_EMIT_SE_CNTL_STATUS);
++   rmesa->hw.msc.cmd[MSC_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_RE_MISC);
++   rmesa->hw.tex[0].cmd[TEX_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_PP_TXFILTER_0);
++   rmesa->hw.tex[0].cmd[TEX_CMD_1] = cmdpkt(rmesa, RADEON_EMIT_PP_BORDER_COLOR_0);
++   rmesa->hw.tex[1].cmd[TEX_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_PP_TXFILTER_1);
++   rmesa->hw.tex[1].cmd[TEX_CMD_1] = cmdpkt(rmesa, RADEON_EMIT_PP_BORDER_COLOR_1);
++   rmesa->hw.tex[2].cmd[TEX_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_PP_TXFILTER_2);
++   rmesa->hw.tex[2].cmd[TEX_CMD_1] = cmdpkt(rmesa, RADEON_EMIT_PP_BORDER_COLOR_2);
++   rmesa->hw.cube[0].cmd[CUBE_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_PP_CUBIC_FACES_0);
++   rmesa->hw.cube[0].cmd[CUBE_CMD_1] = cmdpkt(rmesa, RADEON_EMIT_PP_CUBIC_OFFSETS_T0);
++   rmesa->hw.cube[1].cmd[CUBE_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_PP_CUBIC_FACES_1);
++   rmesa->hw.cube[1].cmd[CUBE_CMD_1] = cmdpkt(rmesa, RADEON_EMIT_PP_CUBIC_OFFSETS_T1);
++   rmesa->hw.cube[2].cmd[CUBE_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_PP_CUBIC_FACES_2);
++   rmesa->hw.cube[2].cmd[CUBE_CMD_1] = cmdpkt(rmesa, RADEON_EMIT_PP_CUBIC_OFFSETS_T2);
++   rmesa->hw.zbs.cmd[ZBS_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_SE_ZBIAS_FACTOR);
++   rmesa->hw.tcl.cmd[TCL_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_SE_TCL_OUTPUT_VTX_FMT);
+    rmesa->hw.mtl.cmd[MTL_CMD_0] = 
+-      cmdpkt(RADEON_EMIT_SE_TCL_MATERIAL_EMMISSIVE_RED);
+-   rmesa->hw.txr[0].cmd[TXR_CMD_0] = cmdpkt(RADEON_EMIT_PP_TEX_SIZE_0);
+-   rmesa->hw.txr[1].cmd[TXR_CMD_0] = cmdpkt(RADEON_EMIT_PP_TEX_SIZE_1);
+-   rmesa->hw.txr[2].cmd[TXR_CMD_0] = cmdpkt(RADEON_EMIT_PP_TEX_SIZE_2);
++      cmdpkt(rmesa, RADEON_EMIT_SE_TCL_MATERIAL_EMMISSIVE_RED);
++   rmesa->hw.txr[0].cmd[TXR_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_PP_TEX_SIZE_0);
++   rmesa->hw.txr[1].cmd[TXR_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_PP_TEX_SIZE_1);
++   rmesa->hw.txr[2].cmd[TXR_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_PP_TEX_SIZE_2);
+    rmesa->hw.grd.cmd[GRD_CMD_0] = 
+       cmdscl( RADEON_SS_VERT_GUARD_CLIP_ADJ_ADDR, 1, 4 );
+    rmesa->hw.fog.cmd[FOG_CMD_0] = 
+@@ -331,6 +701,22 @@ void radeonInitState( radeonContextPtr rmesa )
+ 	 cmdvec( RADEON_VS_UCP_ADDR + i, 1, 4 );
+    }
+ 
++   if (rmesa->radeon.radeonScreen->kernel_mm) {
++      rmesa->hw.grd.emit = scl_emit;
++      rmesa->hw.fog.emit = vec_emit;
++      rmesa->hw.glt.emit = vec_emit;
++      rmesa->hw.eye.emit = vec_emit;
++      
++      for (i = 0; i <= 6; i++)
++	 rmesa->hw.mat[i].emit = vec_emit;
 +
-+	GLuint totalsize; /** total size of the miptree, in bytes */
++      for (i = 0; i < 8; i++)
++	 rmesa->hw.lit[i].emit = lit_emit;
 +
-+	GLenum target; /** GL_TEXTURE_xxx */
-+	GLuint faces; /** # of faces: 6 for cubemaps, 1 otherwise */
-+	GLuint firstLevel; /** First mip level stored in this mipmap tree */
-+	GLuint lastLevel; /** Last mip level stored in this mipmap tree */
++      for (i = 0; i < 6; i++)
++	 rmesa->hw.ucp[i].emit = vec_emit;
++   }
 +
-+	GLuint width0; /** Width of firstLevel image */
-+	GLuint height0; /** Height of firstLevel image */
-+	GLuint depth0; /** Depth of firstLevel image */
+    rmesa->last_ReallyEnabled = -1;
+ 
+    /* Initial Harware state:
+@@ -352,19 +738,7 @@ void radeonInitState( radeonContextPtr rmesa )
+ 					    RADEON_SRC_BLEND_GL_ONE |
+ 					    RADEON_DST_BLEND_GL_ZERO );
+ 
+-   rmesa->hw.ctx.cmd[CTX_RB3D_DEPTHOFFSET] =
+-      rmesa->radeonScreen->depthOffset + rmesa->radeonScreen->fbLocation;
+-
+-   rmesa->hw.ctx.cmd[CTX_RB3D_DEPTHPITCH] = 
+-      ((rmesa->radeonScreen->depthPitch &
+-	RADEON_DEPTHPITCH_MASK) |
+-       RADEON_DEPTH_ENDIAN_NO_SWAP);
+-       
+-   if (rmesa->using_hyperz)
+-       rmesa->hw.ctx.cmd[CTX_RB3D_DEPTHPITCH] |= RADEON_DEPTH_HYPERZ;
+-
+-   rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] = (depth_fmt |
+-					       RADEON_Z_TEST_LESS |
++   rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] = (RADEON_Z_TEST_LESS |
+ 					       RADEON_STENCIL_TEST_ALWAYS |
+ 					       RADEON_STENCIL_FAIL_KEEP |
+ 					       RADEON_STENCIL_ZPASS_KEEP |
+@@ -374,7 +748,7 @@ void radeonInitState( radeonContextPtr rmesa )
+    if (rmesa->using_hyperz) {
+        rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] |= RADEON_Z_COMPRESSION_ENABLE |
+ 						   RADEON_Z_DECOMPRESSION_ENABLE;
+-      if (rmesa->radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
++      if (rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
+ 	 /* works for q3, but slight rendering errors with glxgears ? */
+ /*	 rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] |= RADEON_Z_HIERARCHY_ENABLE;*/
+ 	 /* need this otherwise get lots of lockups with q3 ??? */
+@@ -386,10 +760,9 @@ void radeonInitState( radeonContextPtr rmesa )
+ 				     RADEON_ANTI_ALIAS_NONE);
+ 
+    rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] = (RADEON_PLANE_MASK_ENABLE |
+-				       color_fmt |
+ 				       RADEON_ZBLOCK16);
+ 
+-   switch ( driQueryOptioni( &rmesa->optionCache, "dither_mode" ) ) {
++   switch ( driQueryOptioni( &rmesa->radeon.optionCache, "dither_mode" ) ) {
+    case DRI_CONF_DITHER_XERRORDIFFRESET:
+       rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= RADEON_DITHER_INIT;
+       break;
+@@ -397,30 +770,17 @@ void radeonInitState( radeonContextPtr rmesa )
+       rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= RADEON_SCALE_DITHER_ENABLE;
+       break;
+    }
+-   if ( driQueryOptioni( &rmesa->optionCache, "round_mode" ) ==
++   if ( driQueryOptioni( &rmesa->radeon.optionCache, "round_mode" ) ==
+ 	DRI_CONF_ROUND_ROUND )
+-      rmesa->state.color.roundEnable = RADEON_ROUND_ENABLE;
++      rmesa->radeon.state.color.roundEnable = RADEON_ROUND_ENABLE;
+    else
+-      rmesa->state.color.roundEnable = 0;
+-   if ( driQueryOptioni (&rmesa->optionCache, "color_reduction" ) ==
++      rmesa->radeon.state.color.roundEnable = 0;
++   if ( driQueryOptioni (&rmesa->radeon.optionCache, "color_reduction" ) ==
+ 	DRI_CONF_COLOR_REDUCTION_DITHER )
+       rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= RADEON_DITHER_ENABLE;
+    else
+-      rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= rmesa->state.color.roundEnable;
++      rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= rmesa->radeon.state.color.roundEnable;
+ 
+-   rmesa->hw.ctx.cmd[CTX_RB3D_COLOROFFSET] = ((drawOffset +
+-					       rmesa->radeonScreen->fbLocation)
+-					      & RADEON_COLOROFFSET_MASK);
+-
+-   rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] = ((drawPitch &
+-					      RADEON_COLORPITCH_MASK) |
+-					     RADEON_COLOR_ENDIAN_NO_SWAP);
+-
+-
+-   /* (fixed size) sarea is initialized to zero afaics so can omit version check. Phew! */
+-   if (rmesa->sarea->tiling_enabled) {
+-      rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |= RADEON_COLOR_TILE_ENABLE;
+-   }
+ 
+    rmesa->hw.set.cmd[SET_SE_CNTL] = (RADEON_FFACE_CULL_CCW |
+ 				     RADEON_BFACE_SOLID |
+@@ -444,7 +804,7 @@ void radeonInitState( radeonContextPtr rmesa )
+   					    RADEON_VC_NO_SWAP;
+ #endif
+ 
+-   if (!(rmesa->radeonScreen->chip_flags & RADEON_CHIPSET_TCL)) {
++   if (!(rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL)) {
+      rmesa->hw.set.cmd[SET_SE_CNTL_STATUS] |= RADEON_TCL_BYPASS;
+    }
+ 
+@@ -491,8 +851,8 @@ void radeonInitState( radeonContextPtr rmesa )
+ 	   (2 << RADEON_TXFORMAT_HEIGHT_SHIFT));
+ 
+       /* Initialize the texture offset to the start of the card texture heap */
+-      rmesa->hw.tex[i].cmd[TEX_PP_TXOFFSET] =
+-	  rmesa->radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
++      //      rmesa->hw.tex[i].cmd[TEX_PP_TXOFFSET] =
++      //	  rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
+ 
+       rmesa->hw.tex[i].cmd[TEX_PP_BORDER_COLOR] = 0;
+       rmesa->hw.tex[i].cmd[TEX_PP_TXCBLEND] =  
+@@ -513,15 +873,15 @@ void radeonInitState( radeonContextPtr rmesa )
+ 
+       rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_FACES] = 0;
+       rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_0] =
+-	  rmesa->radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
++	  rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
+       rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_1] =
+-	  rmesa->radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
++	  rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
+       rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_2] =
+-	  rmesa->radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
++	  rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
+       rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_3] =
+-	  rmesa->radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
++	  rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
+       rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_4] =
+-	  rmesa->radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
++	  rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
+    }
+ 
+    /* Can only add ST1 at the time of doing some multitex but can keep
+@@ -613,5 +973,7 @@ void radeonInitState( radeonContextPtr rmesa )
+    rmesa->hw.eye.cmd[EYE_Z] = IEEE_ONE;
+    rmesa->hw.eye.cmd[EYE_RESCALE_FACTOR] = IEEE_ONE;
+    
+-   rmesa->hw.all_dirty = GL_TRUE;
++   rmesa->radeon.hw.all_dirty = GL_TRUE;
++
++   rcommonInitCmdBuf(&rmesa->radeon);
+ }
+diff --git a/src/mesa/drivers/dri/radeon/radeon_swtcl.c b/src/mesa/drivers/dri/radeon/radeon_swtcl.c
+index ebea1fe..af933a3 100644
+--- a/src/mesa/drivers/dri/radeon/radeon_swtcl.c
++++ b/src/mesa/drivers/dri/radeon/radeon_swtcl.c
+@@ -52,8 +52,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "radeon_tcl.h"
+ 
+ 
+-static void flush_last_swtcl_prim( radeonContextPtr rmesa  );
+-
+ /* R100: xyzw, c0, c1/fog, stq[0..2]  = 4+1+1+3*3 = 15  right? */
+ /* R200: xyzw, c0, c1/fog, strq[0..5] = 4+1+1+4*6 = 30 */
+ #define RADEON_MAX_TNL_VERTEX_SIZE (15 * sizeof(GLfloat))	/* for mesa _tnl stage */
+@@ -64,18 +62,18 @@ static void flush_last_swtcl_prim( radeonContextPtr rmesa  );
+ 
+ #define EMIT_ATTR( ATTR, STYLE, F0 )					\
+ do {									\
+-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].attrib = (ATTR);	\
+-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].format = (STYLE);	\
+-   rmesa->swtcl.vertex_attr_count++;					\
++   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].attrib = (ATTR);	\
++   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].format = (STYLE);	\
++   rmesa->radeon.swtcl.vertex_attr_count++;					\
+    fmt_0 |= F0;								\
+ } while (0)
+ 
+ #define EMIT_PAD( N )							\
+ do {									\
+-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].attrib = 0;		\
+-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].format = EMIT_PAD;	\
+-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].offset = (N);		\
+-   rmesa->swtcl.vertex_attr_count++;					\
++   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].attrib = 0;		\
++   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].format = EMIT_PAD;	\
++   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].offset = (N);		\
++   rmesa->radeon.swtcl.vertex_attr_count++;					\
+ } while (0)
+ 
+ static GLuint radeon_cp_vc_frmts[3][2] =
+@@ -87,7 +85,7 @@ static GLuint radeon_cp_vc_frmts[3][2] =
+ 
+ static void radeonSetVertexFormat( GLcontext *ctx )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
++   r100ContextPtr rmesa = R100_CONTEXT( ctx );
+    TNLcontext *tnl = TNL_CONTEXT(ctx);
+    struct vertex_buffer *VB = &tnl->vb;
+    DECLARE_RENDERINPUTS(index_bitset);
+@@ -106,7 +104,7 @@ static void radeonSetVertexFormat( GLcontext *ctx )
+    }
+ 
+    assert( VB->AttribPtr[VERT_ATTRIB_POS] != NULL );
+-   rmesa->swtcl.vertex_attr_count = 0;
++   rmesa->radeon.swtcl.vertex_attr_count = 0;
+ 
+    /* EMIT_ATTR's must be in order as they tell t_vertex.c how to
+     * build up a hardware vertex.
+@@ -204,33 +202,33 @@ static void radeonSetVertexFormat( GLcontext *ctx )
+       }
+    }
+ 
+-   if (!RENDERINPUTS_EQUAL( rmesa->tnl_index_bitset, index_bitset ) ||
++   if (!RENDERINPUTS_EQUAL( rmesa->radeon.tnl_index_bitset, index_bitset ) ||
+ 	fmt_0 != rmesa->swtcl.vertex_format) {
+       RADEON_NEWPRIM(rmesa);
+       rmesa->swtcl.vertex_format = fmt_0;
+-      rmesa->swtcl.vertex_size =
++      rmesa->radeon.swtcl.vertex_size =
+ 	  _tnl_install_attrs( ctx,
+-			      rmesa->swtcl.vertex_attrs, 
+-			      rmesa->swtcl.vertex_attr_count,
++			      rmesa->radeon.swtcl.vertex_attrs, 
++			      rmesa->radeon.swtcl.vertex_attr_count,
+ 			      NULL, 0 );
+-      rmesa->swtcl.vertex_size /= 4;
+-      RENDERINPUTS_COPY( rmesa->tnl_index_bitset, index_bitset );
++      rmesa->radeon.swtcl.vertex_size /= 4;
++      RENDERINPUTS_COPY( rmesa->radeon.tnl_index_bitset, index_bitset );
+       if (RADEON_DEBUG & DEBUG_VERTS)
+ 	 fprintf( stderr, "%s: vertex_size= %d floats\n",
+-		  __FUNCTION__, rmesa->swtcl.vertex_size);
++		  __FUNCTION__, rmesa->radeon.swtcl.vertex_size);
+    }
+ }
+ 
+ 
+ static void radeonRenderStart( GLcontext *ctx )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
++   r100ContextPtr rmesa = R100_CONTEXT( ctx );
+ 
+    radeonSetVertexFormat( ctx );
+    
+-   if (rmesa->dma.flush != 0 && 
+-       rmesa->dma.flush != flush_last_swtcl_prim)
+-      rmesa->dma.flush( rmesa );
++   if (rmesa->radeon.dma.flush != 0 && 
++       rmesa->radeon.dma.flush != rcommon_flush_last_swtcl_prim)
++      rmesa->radeon.dma.flush( ctx );
+ }
+ 
+ 
+@@ -241,7 +239,7 @@ static void radeonRenderStart( GLcontext *ctx )
+  */
+ void radeonChooseVertexState( GLcontext *ctx )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
++   r100ContextPtr rmesa = R100_CONTEXT( ctx );
+    TNLcontext *tnl = TNL_CONTEXT(ctx);
+ 
+    GLuint se_coord_fmt = rmesa->hw.set.cmd[SET_SE_COORDFMT];
+@@ -254,7 +252,7 @@ void radeonChooseVertexState( GLcontext *ctx )
+     * rasterization fallback.  As this function will be called again when we
+     * leave a rasterization fallback, we can just skip it for now.
+     */
+-   if (rmesa->Fallback != 0)
++   if (rmesa->radeon.Fallback != 0)
+       return;
+ 
+    /* HW perspective divide is a win, but tiny vertex formats are a
+@@ -281,80 +279,29 @@ void radeonChooseVertexState( GLcontext *ctx )
+    }
+ }
+ 
+-
+-/* Flush vertices in the current dma region.
+- */
+-static void flush_last_swtcl_prim( radeonContextPtr rmesa  )
++void r100_swtcl_flush(GLcontext *ctx, uint32_t current_offset)
+ {
+-   if (RADEON_DEBUG & DEBUG_IOCTL)
+-      fprintf(stderr, "%s\n", __FUNCTION__);
+-
+-   rmesa->dma.flush = NULL;
+-
+-   if (rmesa->dma.current.buf) {
+-      struct radeon_dma_region *current = &rmesa->dma.current;
+-      GLuint current_offset = (rmesa->radeonScreen->gart_buffer_offset +
+-			       current->buf->buf->idx * RADEON_BUFFER_SIZE + 
+-			       current->start);
+-
+-      assert (!(rmesa->swtcl.hw_primitive & RADEON_CP_VC_CNTL_PRIM_WALK_IND));
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+ 
+-      assert (current->start + 
+-	      rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
+-	      current->ptr);
++   rcommonEnsureCmdBufSpace(&rmesa->radeon,
++			    rmesa->radeon.hw.max_state_size + (12*sizeof(int)),
++			    __FUNCTION__);
+ 
+-      if (rmesa->dma.current.start != rmesa->dma.current.ptr) {
+-	 radeonEnsureCmdBufSpace( rmesa, VERT_AOS_BUFSZ +
+-			          rmesa->hw.max_state_size + VBUF_BUFSZ );
+ 
+-	 radeonEmitVertexAOS( rmesa,
+-			      rmesa->swtcl.vertex_size,
+-			      current_offset);
++   radeonEmitState(&rmesa->radeon);
++   radeonEmitVertexAOS( rmesa,
++			rmesa->radeon.swtcl.vertex_size,
++			rmesa->radeon.dma.current,
++			current_offset);
+ 
+-	 radeonEmitVbufPrim( rmesa,
+-			     rmesa->swtcl.vertex_format,
+-			     rmesa->swtcl.hw_primitive,
+-			     rmesa->swtcl.numverts);
+-      }
++		      
++   radeonEmitVbufPrim( rmesa,
++		       rmesa->swtcl.vertex_format,
++		       rmesa->radeon.swtcl.hw_primitive,
++		       rmesa->radeon.swtcl.numverts);
+ 
+-      rmesa->swtcl.numverts = 0;
+-      current->start = current->ptr;
+-   }
+ }
+ 
+-
+-/* Alloc space in the current dma region.
+- */
+-static INLINE void *
+-radeonAllocDmaLowVerts( radeonContextPtr rmesa, int nverts, int vsize )
+-{
+-   GLuint bytes = vsize * nverts;
+-
+-   if ( rmesa->dma.current.ptr + bytes > rmesa->dma.current.end ) 
+-      radeonRefillCurrentDmaRegion( rmesa );
+-
+-   if (!rmesa->dma.flush) {
+-      rmesa->glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
+-      rmesa->dma.flush = flush_last_swtcl_prim;
+-   }
+-
+-   assert( vsize == rmesa->swtcl.vertex_size * 4 );
+-   assert( rmesa->dma.flush == flush_last_swtcl_prim );
+-   assert (rmesa->dma.current.start + 
+-	   rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
+-	   rmesa->dma.current.ptr);
+-
+-
+-   {
+-      GLubyte *head = (GLubyte *)(rmesa->dma.current.address + rmesa->dma.current.ptr);
+-      rmesa->dma.current.ptr += bytes;
+-      rmesa->swtcl.numverts += nverts;
+-      return head;
+-   }
+-
+-}
+-
+-
+ /*
+  * Render unclipped vertex buffers by emitting vertices directly to
+  * dma buffers.  Use strip/fan hardware primitives where possible.
+@@ -387,22 +334,22 @@ static const GLuint hw_prim[GL_POLYGON+1] = {
+ };
+ 
+ static INLINE void
+-radeonDmaPrimitive( radeonContextPtr rmesa, GLenum prim )
++radeonDmaPrimitive( r100ContextPtr rmesa, GLenum prim )
+ {
+    RADEON_NEWPRIM( rmesa );
+-   rmesa->swtcl.hw_primitive = hw_prim[prim];
+-   assert(rmesa->dma.current.ptr == rmesa->dma.current.start);
++   rmesa->radeon.swtcl.hw_primitive = hw_prim[prim];
++   //   assert(rmesa->radeon.dma.current.ptr == rmesa->radeon.dma.current.start);
+ }
+ 
+-#define LOCAL_VARS radeonContextPtr rmesa = RADEON_CONTEXT(ctx)
++#define LOCAL_VARS r100ContextPtr rmesa = R100_CONTEXT(ctx)
+ #define INIT( prim ) radeonDmaPrimitive( rmesa, prim )
+ #define FLUSH()  RADEON_NEWPRIM( rmesa )
+-#define GET_CURRENT_VB_MAX_VERTS() \
+-  (((int)rmesa->dma.current.end - (int)rmesa->dma.current.ptr) / (rmesa->swtcl.vertex_size*4))
++#define GET_CURRENT_VB_MAX_VERTS()					10\
++//  (((int)rmesa->radeon.dma.current.end - (int)rmesa->radeon.dma.current.ptr) / (rmesa->radeon.swtcl.vertex_size*4))
+ #define GET_SUBSEQUENT_VB_MAX_VERTS() \
+-  ((RADEON_BUFFER_SIZE) / (rmesa->swtcl.vertex_size*4))
++  ((RADEON_BUFFER_SIZE) / (rmesa->radeon.swtcl.vertex_size*4))
+ #define ALLOC_VERTS( nr ) \
+-  radeonAllocDmaLowVerts( rmesa, nr, rmesa->swtcl.vertex_size * 4 )
++  rcommonAllocDmaLowVerts( &rmesa->radeon, nr, rmesa->radeon.swtcl.vertex_size * 4 )
+ #define EMIT_VERTS( ctx, j, nr, buf ) \
+   _tnl_emit_vertices_to_buffer(ctx, j, (j)+(nr), buf)
+ 
+@@ -418,16 +365,13 @@ radeonDmaPrimitive( radeonContextPtr rmesa, GLenum prim )
+ static GLboolean radeon_run_render( GLcontext *ctx,
+ 				    struct tnl_pipeline_stage *stage )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    TNLcontext *tnl = TNL_CONTEXT(ctx);
+    struct vertex_buffer *VB = &tnl->vb;
+    tnl_render_func *tab = TAG(render_tab_verts);
+    GLuint i;
+ 
+-   if (rmesa->swtcl.indexed_verts.buf) 
+-      RELEASE_ELT_VERTS();
+-   	
+-   if (rmesa->swtcl.RenderIndex != 0 ||   
++   if (rmesa->radeon.swtcl.RenderIndex != 0 ||   
+        !radeon_dma_validate_render( ctx, VB ))
+       return GL_TRUE;		
+ 
+@@ -496,13 +440,13 @@ static void radeonResetLineStipple( GLcontext *ctx );
+ 
+ #undef LOCAL_VARS
+ #undef ALLOC_VERTS
+-#define CTX_ARG radeonContextPtr rmesa
+-#define GET_VERTEX_DWORDS() rmesa->swtcl.vertex_size
+-#define ALLOC_VERTS( n, size ) radeonAllocDmaLowVerts( rmesa, n, (size) * 4 )
++#define CTX_ARG r100ContextPtr rmesa
++#define GET_VERTEX_DWORDS() rmesa->radeon.swtcl.vertex_size
++#define ALLOC_VERTS( n, size ) rcommonAllocDmaLowVerts( &rmesa->radeon, n, (size) * 4 )
+ #undef LOCAL_VARS
+ #define LOCAL_VARS						\
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);		\
+-   const char *radeonverts = (char *)rmesa->swtcl.verts;
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);		\
++   const char *radeonverts = (char *)rmesa->radeon.swtcl.verts;
+ #define VERT(x) (radeonVertex *)(radeonverts + ((x) * (vertsize) * sizeof(int)))
+ #define VERTEX radeonVertex 
+ #undef TAG
+@@ -560,7 +504,7 @@ static struct {
+ #define VERT_Y(_v) _v->v.y
+ #define VERT_Z(_v) _v->v.z
+ #define AREA_IS_CCW( a ) (a < 0)
+-#define GET_VERTEX(e) (rmesa->swtcl.verts + ((e) * rmesa->swtcl.vertex_size * sizeof(int)))
++#define GET_VERTEX(e) (rmesa->radeon.swtcl.verts + ((e) * rmesa->radeon.swtcl.vertex_size * sizeof(int)))
+ 
+ #define VERT_SET_RGBA( v, c )  					\
+ do {								\
+@@ -606,7 +550,7 @@ do {							\
+ #undef INIT
+ 
+ #define LOCAL_VARS(n)							\
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);			\
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);			\
+    GLuint color[n], spec[n];						\
+    GLuint coloroffset = rmesa->swtcl.coloroffset;	\
+    GLuint specoffset = rmesa->swtcl.specoffset;			\
+@@ -617,7 +561,7 @@ do {							\
+  ***********************************************************************/
+ 
+ #define RASTERIZE(x) radeonRasterPrimitive( ctx, reduced_hw_prim[x] )
+-#define RENDER_PRIMITIVE rmesa->swtcl.render_primitive
++#define RENDER_PRIMITIVE rmesa->radeon.swtcl.render_primitive
+ #undef TAG
+ #define TAG(x) x
+ #include "tnl_dd/t_dd_unfilled.h"
+@@ -673,9 +617,9 @@ static void init_rast_tab( void )
+ } while (0)
+ #undef LOCAL_VARS
+ #define LOCAL_VARS						\
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);		\
+-   const GLuint vertsize = rmesa->swtcl.vertex_size;		\
+-   const char *radeonverts = (char *)rmesa->swtcl.verts;		\
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);		\
++   const GLuint vertsize = rmesa->radeon.swtcl.vertex_size;		\
++   const char *radeonverts = (char *)rmesa->radeon.swtcl.verts;		\
+    const GLuint * const elt = TNL_CONTEXT(ctx)->vb.Elts;	\
+    const GLboolean stipple = ctx->Line.StippleFlag;		\
+    (void) elt; (void) stipple;
+@@ -700,17 +644,17 @@ static void init_rast_tab( void )
+ void radeonChooseRenderState( GLcontext *ctx )
+ {
+    TNLcontext *tnl = TNL_CONTEXT(ctx);
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    GLuint index = 0;
+    GLuint flags = ctx->_TriangleCaps;
+ 
+-   if (!rmesa->TclFallback || rmesa->Fallback) 
++   if (!rmesa->radeon.TclFallback || rmesa->radeon.Fallback) 
+       return;
+ 
+    if (flags & DD_TRI_LIGHT_TWOSIDE) index |= RADEON_TWOSIDE_BIT;
+    if (flags & DD_TRI_UNFILLED)      index |= RADEON_UNFILLED_BIT;
+ 
+-   if (index != rmesa->swtcl.RenderIndex) {
++   if (index != rmesa->radeon.swtcl.RenderIndex) {
+       tnl->Driver.Render.Points = rast_tab[index].points;
+       tnl->Driver.Render.Line = rast_tab[index].line;
+       tnl->Driver.Render.ClippedLine = rast_tab[index].line;
+@@ -727,7 +671,7 @@ void radeonChooseRenderState( GLcontext *ctx )
+ 	 tnl->Driver.Render.ClippedPolygon = _tnl_RenderClippedPolygon;
+       }
+ 
+-      rmesa->swtcl.RenderIndex = index;
++      rmesa->radeon.swtcl.RenderIndex = index;
+    }
+ }
+ 
+@@ -739,18 +683,18 @@ void radeonChooseRenderState( GLcontext *ctx )
+ 
+ static void radeonRasterPrimitive( GLcontext *ctx, GLuint hwprim )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+ 
+-   if (rmesa->swtcl.hw_primitive != hwprim) {
++   if (rmesa->radeon.swtcl.hw_primitive != hwprim) {
+       RADEON_NEWPRIM( rmesa );
+-      rmesa->swtcl.hw_primitive = hwprim;
++      rmesa->radeon.swtcl.hw_primitive = hwprim;
+    }
+ }
+ 
+ static void radeonRenderPrimitive( GLcontext *ctx, GLenum prim )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+-   rmesa->swtcl.render_primitive = prim;
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
++   rmesa->radeon.swtcl.render_primitive = prim;
+    if (prim < GL_TRIANGLES || !(ctx->_TriangleCaps & DD_TRI_UNFILLED)) 
+       radeonRasterPrimitive( ctx, reduced_hw_prim[prim] );
+ }
+@@ -761,7 +705,7 @@ static void radeonRenderFinish( GLcontext *ctx )
+ 
+ static void radeonResetLineStipple( GLcontext *ctx )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    RADEON_STATECHANGE( rmesa, lin );
+ }
+ 
+@@ -795,17 +739,17 @@ static const char *getFallbackString(GLuint bit)
+ 
+ void radeonFallback( GLcontext *ctx, GLuint bit, GLboolean mode )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    TNLcontext *tnl = TNL_CONTEXT(ctx);
+-   GLuint oldfallback = rmesa->Fallback;
++   GLuint oldfallback = rmesa->radeon.Fallback;
+ 
+    if (mode) {
+-      rmesa->Fallback |= bit;
++      rmesa->radeon.Fallback |= bit;
+       if (oldfallback == 0) {
+-	 RADEON_FIREVERTICES( rmesa );
++	 radeon_firevertices(&rmesa->radeon);
+ 	 TCL_FALLBACK( ctx, RADEON_TCL_FALLBACK_RASTER, GL_TRUE );
+ 	 _swsetup_Wakeup( ctx );
+-	 rmesa->swtcl.RenderIndex = ~0;
++	 rmesa->radeon.swtcl.RenderIndex = ~0;
+          if (RADEON_DEBUG & DEBUG_FALLBACKS) {
+             fprintf(stderr, "Radeon begin rasterization fallback: 0x%x %s\n",
+                     bit, getFallbackString(bit));
+@@ -813,7 +757,7 @@ void radeonFallback( GLcontext *ctx, GLuint bit, GLboolean mode )
+       }
+    }
+    else {
+-      rmesa->Fallback &= ~bit;
++      rmesa->radeon.Fallback &= ~bit;
+       if (oldfallback == bit) {
+ 	 _swrast_flush( ctx );
+ 	 tnl->Driver.Render.Start = radeonRenderStart;
+@@ -826,14 +770,14 @@ void radeonFallback( GLcontext *ctx, GLuint bit, GLboolean mode )
+ 
+ 	 tnl->Driver.Render.ResetLineStipple = radeonResetLineStipple;
+ 	 TCL_FALLBACK( ctx, RADEON_TCL_FALLBACK_RASTER, GL_FALSE );
+-	 if (rmesa->TclFallback) {
+-	    /* These are already done if rmesa->TclFallback goes to
++	 if (rmesa->radeon.TclFallback) {
++	    /* These are already done if rmesa->radeon.TclFallback goes to
+ 	     * zero above. But not if it doesn't (RADEON_NO_TCL for
+ 	     * example?)
+ 	     */
+ 	    _tnl_invalidate_vertex_state( ctx, ~0 );
+ 	    _tnl_invalidate_vertices( ctx, ~0 );
+-	    RENDERINPUTS_ZERO( rmesa->tnl_index_bitset );
++	    RENDERINPUTS_ZERO( rmesa->radeon.tnl_index_bitset );
+ 	    radeonChooseVertexState( ctx );
+ 	    radeonChooseRenderState( ctx );
+ 	 }
+@@ -853,7 +797,7 @@ void radeonFallback( GLcontext *ctx, GLuint bit, GLboolean mode )
+ void radeonInitSwtcl( GLcontext *ctx )
+ {
+    TNLcontext *tnl = TNL_CONTEXT(ctx);
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    static int firsttime = 1;
+ 
+    if (firsttime) {
+@@ -872,18 +816,15 @@ void radeonInitSwtcl( GLcontext *ctx )
+    _tnl_init_vertices( ctx, ctx->Const.MaxArrayLockSize + 12, 
+ 		       RADEON_MAX_TNL_VERTEX_SIZE);
+    
+-   rmesa->swtcl.verts = (GLubyte *)tnl->clipspace.vertex_buf;
+-   rmesa->swtcl.RenderIndex = ~0;
+-   rmesa->swtcl.render_primitive = GL_TRIANGLES;
+-   rmesa->swtcl.hw_primitive = 0;
++   rmesa->radeon.swtcl.verts = (GLubyte *)tnl->clipspace.vertex_buf;
++   rmesa->radeon.swtcl.RenderIndex = ~0;
++   rmesa->radeon.swtcl.render_primitive = GL_TRIANGLES;
++   rmesa->radeon.swtcl.hw_primitive = 0;
+ }
+ 
+ 
+ void radeonDestroySwtcl( GLcontext *ctx )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+ 
+-   if (rmesa->swtcl.indexed_verts.buf) 
+-      radeonReleaseDmaRegion( rmesa, &rmesa->swtcl.indexed_verts, 
+-			      __FUNCTION__ );
+ }
+diff --git a/src/mesa/drivers/dri/radeon/radeon_swtcl.h b/src/mesa/drivers/dri/radeon/radeon_swtcl.h
+index e485052..3ada989 100644
+--- a/src/mesa/drivers/dri/radeon/radeon_swtcl.h
++++ b/src/mesa/drivers/dri/radeon/radeon_swtcl.h
+@@ -63,5 +63,5 @@ extern void radeon_translate_vertex( GLcontext *ctx,
+ 
+ extern void radeon_print_vertex( GLcontext *ctx, const radeonVertex *v );
+ 
+-
++extern void r100_swtcl_flush(GLcontext *ctx, uint32_t current_offset);
+ #endif
+diff --git a/src/mesa/drivers/dri/radeon/radeon_tcl.c b/src/mesa/drivers/dri/radeon/radeon_tcl.c
+index 779e9ae..5887ab3 100644
+--- a/src/mesa/drivers/dri/radeon/radeon_tcl.c
++++ b/src/mesa/drivers/dri/radeon/radeon_tcl.c
+@@ -42,6 +42,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "tnl/tnl.h"
+ #include "tnl/t_pipeline.h"
+ 
++#include "radeon_common.h"
+ #include "radeon_context.h"
+ #include "radeon_state.h"
+ #include "radeon_ioctl.h"
+@@ -104,7 +105,7 @@ static GLboolean discrete_prim[0x10] = {
+ };
+    
+ 
+-#define LOCAL_VARS radeonContextPtr rmesa = RADEON_CONTEXT(ctx)
++#define LOCAL_VARS r100ContextPtr rmesa = R100_CONTEXT(ctx)
+ #define ELT_TYPE  GLushort
+ 
+ #define ELT_INIT(prim, hw_prim) \
+@@ -125,7 +126,7 @@ static GLboolean discrete_prim[0x10] = {
+ 
+ #define RESET_STIPPLE() do {			\
+    RADEON_STATECHANGE( rmesa, lin );		\
+-   radeonEmitState( rmesa );			\
++   radeonEmitState(&rmesa->radeon);			\
+ } while (0)
+ 
+ #define AUTO_STIPPLE( mode )  do {		\
+@@ -136,31 +137,29 @@ static GLboolean discrete_prim[0x10] = {
+    else						\
+       rmesa->hw.lin.cmd[LIN_RE_LINE_PATTERN] &=	\
+ 	 ~RADEON_LINE_PATTERN_AUTO_RESET;	\
+-   radeonEmitState( rmesa );			\
++   radeonEmitState(&rmesa->radeon);		\
+ } while (0)
+ 
+ 
+ 
+ #define ALLOC_ELTS(nr)	radeonAllocElts( rmesa, nr )
+ 
+-static GLushort *radeonAllocElts( radeonContextPtr rmesa, GLuint nr ) 
++static GLushort *radeonAllocElts( r100ContextPtr rmesa, GLuint nr ) 
+ {
+-   if (rmesa->dma.flush)
+-      rmesa->dma.flush( rmesa );
++      if (rmesa->radeon.dma.flush)
++	 rmesa->radeon.dma.flush( rmesa->radeon.glCtx );
+ 
+-   radeonEnsureCmdBufSpace(rmesa, AOS_BUFSZ(rmesa->tcl.nr_aos_components) +
+-			   rmesa->hw.max_state_size + ELTS_BUFSZ(nr));
++      rcommonEnsureCmdBufSpace(&rmesa->radeon, rmesa->radeon.hw.max_state_size + ELTS_BUFSZ(nr) + 
++			       AOS_BUFSZ(rmesa->tcl.nr_aos_components), __FUNCTION__);
+ 
+-   radeonEmitAOS( rmesa,
+-		rmesa->tcl.aos_components,
+-		rmesa->tcl.nr_aos_components, 0 );
++      radeonEmitAOS( rmesa,
++		     rmesa->tcl.nr_aos_components, 0 );
+ 
+-   return radeonAllocEltsOpenEnded( rmesa,
+-				    rmesa->tcl.vertex_format, 
+-				    rmesa->tcl.hw_primitive, nr );
++      return radeonAllocEltsOpenEnded( rmesa, rmesa->tcl.vertex_format,
++				       rmesa->tcl.hw_primitive, nr );
+ }
+ 
+-#define CLOSE_ELTS()  RADEON_NEWPRIM( rmesa )
++#define CLOSE_ELTS() do { if (0) RADEON_NEWPRIM( rmesa ); } while (0) /* never runs RADEON_NEWPRIM */
+ 
+ 
+ 
+@@ -174,14 +173,14 @@ static void radeonEmitPrim( GLcontext *ctx,
+ 		       GLuint start, 
+ 		       GLuint count)	
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
++   r100ContextPtr rmesa = R100_CONTEXT( ctx );
+    radeonTclPrimitive( ctx, prim, hwprim );
+    
+-   radeonEnsureCmdBufSpace( rmesa, AOS_BUFSZ(rmesa->tcl.nr_aos_components) +
+-			    rmesa->hw.max_state_size + VBUF_BUFSZ );
++   rcommonEnsureCmdBufSpace( &rmesa->radeon,
++			     AOS_BUFSZ(rmesa->tcl.nr_aos_components) +
++			     rmesa->radeon.hw.max_state_size + VBUF_BUFSZ, __FUNCTION__ );
+ 
+    radeonEmitAOS( rmesa,
+-		  rmesa->tcl.aos_components,
+ 		  rmesa->tcl.nr_aos_components,
+ 		  start );
+    
+@@ -254,7 +253,7 @@ void radeonTclPrimitive( GLcontext *ctx,
+ 			 GLenum prim,
+ 			 int hw_prim )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    GLuint se_cntl;
+    GLuint newprim = hw_prim | RADEON_CP_VC_CNTL_TCL_ENABLE;
+ 
+@@ -371,7 +370,7 @@ radeonComputeFogBlendFactor( GLcontext *ctx, GLfloat fogcoord )
+ static GLboolean radeon_run_tcl_render( GLcontext *ctx,
+ 					struct tnl_pipeline_stage *stage )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    TNLcontext *tnl = TNL_CONTEXT(ctx);
+    struct vertex_buffer *VB = &tnl->vb;
+    GLuint inputs = VERT_BIT_POS | VERT_BIT_COLOR0;
+@@ -379,7 +378,7 @@ static GLboolean radeon_run_tcl_render( GLcontext *ctx,
+ 
+    /* TODO: separate this from the swtnl pipeline 
+     */
+-   if (rmesa->TclFallback)
++   if (rmesa->radeon.TclFallback)
+       return GL_TRUE;	/* fallback to software t&l */
+ 
+    if (VB->Count == 0)
+@@ -461,7 +460,7 @@ const struct tnl_pipeline_stage _radeon_tcl_stage =
+ 
+ static void transition_to_swtnl( GLcontext *ctx )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    TNLcontext *tnl = TNL_CONTEXT(ctx);
+    GLuint se_cntl;
+ 
+@@ -490,7 +489,7 @@ static void transition_to_swtnl( GLcontext *ctx )
+ 
+ static void transition_to_hwtnl( GLcontext *ctx )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    TNLcontext *tnl = TNL_CONTEXT(ctx);
+    GLuint se_coord_fmt = rmesa->hw.set.cmd[SET_SE_COORDFMT];
+ 
+@@ -509,15 +508,15 @@ static void transition_to_hwtnl( GLcontext *ctx )
+ 
+    tnl->Driver.NotifyMaterialChange = radeonUpdateMaterial;
+ 
+-   if ( rmesa->dma.flush )			
+-      rmesa->dma.flush( rmesa );	
++   if ( rmesa->radeon.dma.flush )			
++      rmesa->radeon.dma.flush( rmesa->radeon.glCtx );	
+ 
+-   rmesa->dma.flush = NULL;
++   rmesa->radeon.dma.flush = NULL;
+    rmesa->swtcl.vertex_format = 0;
+    
+-   if (rmesa->swtcl.indexed_verts.buf) 
+-      radeonReleaseDmaRegion( rmesa, &rmesa->swtcl.indexed_verts, 
+-			      __FUNCTION__ );
++   //   if (rmesa->swtcl.indexed_verts.buf) 
++   //      radeonReleaseDmaRegion( rmesa, &rmesa->swtcl.indexed_verts, 
++   //			      __FUNCTION__ );
+ 
+    if (RADEON_DEBUG & DEBUG_FALLBACKS) 
+       fprintf(stderr, "Radeon end tcl fallback\n");
+@@ -550,11 +549,11 @@ static char *getFallbackString(GLuint bit)
+ 
+ void radeonTclFallback( GLcontext *ctx, GLuint bit, GLboolean mode )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+-   GLuint oldfallback = rmesa->TclFallback;
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
++   GLuint oldfallback = rmesa->radeon.TclFallback;
+ 
+    if (mode) {
+-      rmesa->TclFallback |= bit;
++      rmesa->radeon.TclFallback |= bit;
+       if (oldfallback == 0) {
+ 	 if (RADEON_DEBUG & DEBUG_FALLBACKS) 
+ 	    fprintf(stderr, "Radeon begin tcl fallback %s\n",
+@@ -563,7 +562,7 @@ void radeonTclFallback( GLcontext *ctx, GLuint bit, GLboolean mode )
+       }
+    }
+    else {
+-      rmesa->TclFallback &= ~bit;
++      rmesa->radeon.TclFallback &= ~bit;
+       if (oldfallback == bit) {
+ 	 if (RADEON_DEBUG & DEBUG_FALLBACKS) 
+ 	    fprintf(stderr, "Radeon end tcl fallback %s\n",
+diff --git a/src/mesa/drivers/dri/radeon/radeon_tex.c b/src/mesa/drivers/dri/radeon/radeon_tex.c
+index b0aec21..2dfb504 100644
+--- a/src/mesa/drivers/dri/radeon/radeon_tex.c
++++ b/src/mesa/drivers/dri/radeon/radeon_tex.c
+@@ -44,6 +44,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "main/texobj.h"
+ 
+ #include "radeon_context.h"
++#include "radeon_mipmap_tree.h"
+ #include "radeon_state.h"
+ #include "radeon_ioctl.h"
+ #include "radeon_swtcl.h"
+@@ -170,10 +171,13 @@ static void radeonSetTexFilter( radeonTexObjPtr t, GLenum minf, GLenum magf )
+ {
+    GLuint anisotropy = (t->pp_txfilter & RADEON_MAX_ANISO_MASK);
+ 
++   /* Force revalidation to account for switches from/to mipmapping. */
++   t->validated = GL_FALSE;
++
+    t->pp_txfilter &= ~(RADEON_MIN_FILTER_MASK | RADEON_MAG_FILTER_MASK);
+ 
+    /* r100 chips can't handle mipmaps/aniso for cubemap/volume textures */
+-   if ( t->base.tObj->Target == GL_TEXTURE_CUBE_MAP ) {
++   if ( t->base.Target == GL_TEXTURE_CUBE_MAP ) {
+       switch ( minf ) {
+       case GL_NEAREST:
+       case GL_NEAREST_MIPMAP_NEAREST:
+@@ -244,433 +248,13 @@ static void radeonSetTexBorderColor( radeonTexObjPtr t, GLubyte c[4] )
+    t->pp_border_color = radeonPackColor( 4, c[0], c[1], c[2], c[3] );
+ }
+ 
+-
+-/**
+- * Allocate space for and load the mesa images into the texture memory block.
+- * This will happen before drawing with a new texture, or drawing with a
+- * texture after it was swapped out or teximaged again.
+- */
+-
+-static radeonTexObjPtr radeonAllocTexObj( struct gl_texture_object *texObj )
+-{
+-   radeonTexObjPtr t;
+-
+-   t = CALLOC_STRUCT( radeon_tex_obj );
+-   texObj->DriverData = t;
+-   if ( t != NULL ) {
+-      if ( RADEON_DEBUG & DEBUG_TEXTURE ) {
+-	 fprintf( stderr, "%s( %p, %p )\n", __FUNCTION__, (void *)texObj, (void *)t );
+-      }
+-
+-      /* Initialize non-image-dependent parts of the state:
+-       */
+-      t->base.tObj = texObj;
+-      t->border_fallback = GL_FALSE;
+-
+-      t->pp_txfilter = RADEON_BORDER_MODE_OGL;
+-      t->pp_txformat = (RADEON_TXFORMAT_ENDIAN_NO_SWAP |
+-			RADEON_TXFORMAT_PERSPECTIVE_ENABLE);
+-
+-      make_empty_list( & t->base );
+-
+-      radeonSetTexWrap( t, texObj->WrapS, texObj->WrapT );
+-      radeonSetTexMaxAnisotropy( t, texObj->MaxAnisotropy );
+-      radeonSetTexFilter( t, texObj->MinFilter, texObj->MagFilter );
+-      radeonSetTexBorderColor( t, texObj->_BorderChan );
+-   }
+-
+-   return t;
+-}
+-
+-
+-static const struct gl_texture_format *
+-radeonChooseTextureFormat( GLcontext *ctx, GLint internalFormat,
+-                           GLenum format, GLenum type )
+-{
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+-   const GLboolean do32bpt =
+-       ( rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_32 );
+-   const GLboolean force16bpt =
+-       ( rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_FORCE_16 );
+-   (void) format;
+-
+-   switch ( internalFormat ) {
+-   case 4:
+-   case GL_RGBA:
+-   case GL_COMPRESSED_RGBA:
+-      switch ( type ) {
+-      case GL_UNSIGNED_INT_10_10_10_2:
+-      case GL_UNSIGNED_INT_2_10_10_10_REV:
+-	 return do32bpt ? _dri_texformat_argb8888 : _dri_texformat_argb1555;
+-      case GL_UNSIGNED_SHORT_4_4_4_4:
+-      case GL_UNSIGNED_SHORT_4_4_4_4_REV:
+-	 return _dri_texformat_argb4444;
+-      case GL_UNSIGNED_SHORT_5_5_5_1:
+-      case GL_UNSIGNED_SHORT_1_5_5_5_REV:
+-	 return _dri_texformat_argb1555;
+-      default:
+-         return do32bpt ? _dri_texformat_argb8888 : _dri_texformat_argb4444;
+-      }
+-
+-   case 3:
+-   case GL_RGB:
+-   case GL_COMPRESSED_RGB:
+-      switch ( type ) {
+-      case GL_UNSIGNED_SHORT_4_4_4_4:
+-      case GL_UNSIGNED_SHORT_4_4_4_4_REV:
+-	 return _dri_texformat_argb4444;
+-      case GL_UNSIGNED_SHORT_5_5_5_1:
+-      case GL_UNSIGNED_SHORT_1_5_5_5_REV:
+-	 return _dri_texformat_argb1555;
+-      case GL_UNSIGNED_SHORT_5_6_5:
+-      case GL_UNSIGNED_SHORT_5_6_5_REV:
+-	 return _dri_texformat_rgb565;
+-      default:
+-         return do32bpt ? _dri_texformat_argb8888 : _dri_texformat_rgb565;
+-      }
+-
+-   case GL_RGBA8:
+-   case GL_RGB10_A2:
+-   case GL_RGBA12:
+-   case GL_RGBA16:
+-      return !force16bpt ?
+-	  _dri_texformat_argb8888 : _dri_texformat_argb4444;
+-
+-   case GL_RGBA4:
+-   case GL_RGBA2:
+-      return _dri_texformat_argb4444;
+-
+-   case GL_RGB5_A1:
+-      return _dri_texformat_argb1555;
+-
+-   case GL_RGB8:
+-   case GL_RGB10:
+-   case GL_RGB12:
+-   case GL_RGB16:
+-      return !force16bpt ? _dri_texformat_argb8888 : _dri_texformat_rgb565;
+-
+-   case GL_RGB5:
+-   case GL_RGB4:
+-   case GL_R3_G3_B2:
+-      return _dri_texformat_rgb565;
+-
+-   case GL_ALPHA:
+-   case GL_ALPHA4:
+-   case GL_ALPHA8:
+-   case GL_ALPHA12:
+-   case GL_ALPHA16:
+-   case GL_COMPRESSED_ALPHA:
+-      return _dri_texformat_a8;
+-
+-   case 1:
+-   case GL_LUMINANCE:
+-   case GL_LUMINANCE4:
+-   case GL_LUMINANCE8:
+-   case GL_LUMINANCE12:
+-   case GL_LUMINANCE16:
+-   case GL_COMPRESSED_LUMINANCE:
+-      return _dri_texformat_l8;
+-
+-   case 2:
+-   case GL_LUMINANCE_ALPHA:
+-   case GL_LUMINANCE4_ALPHA4:
+-   case GL_LUMINANCE6_ALPHA2:
+-   case GL_LUMINANCE8_ALPHA8:
+-   case GL_LUMINANCE12_ALPHA4:
+-   case GL_LUMINANCE12_ALPHA12:
+-   case GL_LUMINANCE16_ALPHA16:
+-   case GL_COMPRESSED_LUMINANCE_ALPHA:
+-      return _dri_texformat_al88;
+-
+-   case GL_INTENSITY:
+-   case GL_INTENSITY4:
+-   case GL_INTENSITY8:
+-   case GL_INTENSITY12:
+-   case GL_INTENSITY16:
+-   case GL_COMPRESSED_INTENSITY:
+-      return _dri_texformat_i8;
+-
+-   case GL_YCBCR_MESA:
+-      if (type == GL_UNSIGNED_SHORT_8_8_APPLE ||
+-          type == GL_UNSIGNED_BYTE)
+-         return &_mesa_texformat_ycbcr;
+-      else
+-         return &_mesa_texformat_ycbcr_rev;
+-
+-   case GL_RGB_S3TC:
+-   case GL_RGB4_S3TC:
+-   case GL_COMPRESSED_RGB_S3TC_DXT1_EXT:
+-      return &_mesa_texformat_rgb_dxt1;
+-
+-   case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT:
+-      return &_mesa_texformat_rgba_dxt1;
+-
+-   case GL_RGBA_S3TC:
+-   case GL_RGBA4_S3TC:
+-   case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT:
+-      return &_mesa_texformat_rgba_dxt3;
+-
+-   case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT:
+-      return &_mesa_texformat_rgba_dxt5;
+-
+-   default:
+-      _mesa_problem(ctx, "unexpected texture format in %s", __FUNCTION__);
+-      return NULL;
+-   }
+-
+-   return NULL; /* never get here */
+-}
+-
+-
+-static void radeonTexImage1D( GLcontext *ctx, GLenum target, GLint level,
+-                              GLint internalFormat,
+-                              GLint width, GLint border,
+-                              GLenum format, GLenum type, const GLvoid *pixels,
+-                              const struct gl_pixelstore_attrib *packing,
+-                              struct gl_texture_object *texObj,
+-                              struct gl_texture_image *texImage )
+-{
+-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
+-
+-   if ( t ) {
+-      driSwapOutTextureObject( t );
+-   }
+-   else {
+-      t = (driTextureObject *) radeonAllocTexObj( texObj );
+-      if (!t) {
+-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage1D");
+-         return;
+-      }
+-   }
+-
+-   /* Note, this will call ChooseTextureFormat */
+-   _mesa_store_teximage1d(ctx, target, level, internalFormat,
+-                          width, border, format, type, pixels,
+-                          &ctx->Unpack, texObj, texImage);
+-
+-   t->dirty_images[0] |= (1 << level);
+-}
+-
+-
+-static void radeonTexSubImage1D( GLcontext *ctx, GLenum target, GLint level,
+-                                 GLint xoffset,
+-                                 GLsizei width,
+-                                 GLenum format, GLenum type,
+-                                 const GLvoid *pixels,
+-                                 const struct gl_pixelstore_attrib *packing,
+-                                 struct gl_texture_object *texObj,
+-                                 struct gl_texture_image *texImage )
+-{
+-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
+-
+-   assert( t ); /* this _should_ be true */
+-   if ( t ) {
+-      driSwapOutTextureObject( t );
+-   }
+-   else {
+-      t = (driTextureObject *) radeonAllocTexObj( texObj );
+-      if (!t) {
+-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage1D");
+-         return;
+-      }
+-   }
+-
+-   _mesa_store_texsubimage1d(ctx, target, level, xoffset, width,
+-			     format, type, pixels, packing, texObj,
+-			     texImage);
+-
+-   t->dirty_images[0] |= (1 << level);
+-}
+-
+-
+-static void radeonTexImage2D( GLcontext *ctx, GLenum target, GLint level,
+-                              GLint internalFormat,
+-                              GLint width, GLint height, GLint border,
+-                              GLenum format, GLenum type, const GLvoid *pixels,
+-                              const struct gl_pixelstore_attrib *packing,
+-                              struct gl_texture_object *texObj,
+-                              struct gl_texture_image *texImage )
+-{
+-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
+-   GLuint face;
+-
+-   /* which cube face or ordinary 2D image */
+-   switch (target) {
+-   case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
+-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
+-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
+-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
+-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
+-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
+-      face = (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
+-      ASSERT(face < 6);
+-      break;
+-   default:
+-      face = 0;
+-   }
+-
+-   if ( t != NULL ) {
+-      driSwapOutTextureObject( t );
+-   }
+-   else {
+-      t = (driTextureObject *) radeonAllocTexObj( texObj );
+-      if (!t) {
+-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage2D");
+-         return;
+-      }
+-   }
+-
+-   /* Note, this will call ChooseTextureFormat */
+-   _mesa_store_teximage2d(ctx, target, level, internalFormat,
+-                          width, height, border, format, type, pixels,
+-                          &ctx->Unpack, texObj, texImage);
+-
+-   t->dirty_images[face] |= (1 << level);
+-}
+-
+-
+-static void radeonTexSubImage2D( GLcontext *ctx, GLenum target, GLint level,
+-                                 GLint xoffset, GLint yoffset,
+-                                 GLsizei width, GLsizei height,
+-                                 GLenum format, GLenum type,
+-                                 const GLvoid *pixels,
+-                                 const struct gl_pixelstore_attrib *packing,
+-                                 struct gl_texture_object *texObj,
+-                                 struct gl_texture_image *texImage )
+-{
+-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
+-   GLuint face;
+-
+-   /* which cube face or ordinary 2D image */
+-   switch (target) {
+-   case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
+-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
+-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
+-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
+-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
+-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
+-      face = (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
+-      ASSERT(face < 6);
+-      break;
+-   default:
+-      face = 0;
+-   }
+-
+-   assert( t ); /* this _should_ be true */
+-   if ( t ) {
+-      driSwapOutTextureObject( t );
+-   }
+-   else {
+-      t = (driTextureObject *) radeonAllocTexObj( texObj );
+-      if (!t) {
+-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage2D");
+-         return;
+-      }
+-   }
+-
+-   _mesa_store_texsubimage2d(ctx, target, level, xoffset, yoffset, width,
+-			     height, format, type, pixels, packing, texObj,
+-			     texImage);
+-
+-   t->dirty_images[face] |= (1 << level);
+-}
+-
+-static void radeonCompressedTexImage2D( GLcontext *ctx, GLenum target, GLint level,
+-                              GLint internalFormat,
+-                              GLint width, GLint height, GLint border,
+-                              GLsizei imageSize, const GLvoid *data,
+-                              struct gl_texture_object *texObj,
+-                              struct gl_texture_image *texImage )
+-{
+-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
+-   GLuint face;
+-
+-   /* which cube face or ordinary 2D image */
+-   switch (target) {
+-   case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
+-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
+-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
+-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
+-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
+-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
+-      face = (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
+-      ASSERT(face < 6);
+-      break;
+-   default:
+-      face = 0;
+-   }
+-
+-   if ( t != NULL ) {
+-      driSwapOutTextureObject( t );
+-   }
+-   else {
+-      t = (driTextureObject *) radeonAllocTexObj( texObj );
+-      if (!t) {
+-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glCompressedTexImage2D");
+-         return;
+-      }
+-   }
+-
+-   /* Note, this will call ChooseTextureFormat */
+-   _mesa_store_compressed_teximage2d(ctx, target, level, internalFormat, width,
+-                                 height, border, imageSize, data, texObj, texImage);
+-
+-   t->dirty_images[face] |= (1 << level);
+-}
+-
+-
+-static void radeonCompressedTexSubImage2D( GLcontext *ctx, GLenum target, GLint level,
+-                                 GLint xoffset, GLint yoffset,
+-                                 GLsizei width, GLsizei height,
+-                                 GLenum format,
+-                                 GLsizei imageSize, const GLvoid *data,
+-                                 struct gl_texture_object *texObj,
+-                                 struct gl_texture_image *texImage )
+-{
+-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
+-   GLuint face;
+-
+-
+-   /* which cube face or ordinary 2D image */
+-   switch (target) {
+-   case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
+-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
+-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
+-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
+-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
+-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
+-      face = (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
+-      ASSERT(face < 6);
+-      break;
+-   default:
+-      face = 0;
+-   }
+-
+-   assert( t ); /* this _should_ be true */
+-   if ( t ) {
+-      driSwapOutTextureObject( t );
+-   }
+-   else {
+-      t = (driTextureObject *) radeonAllocTexObj( texObj );
+-      if (!t) {
+-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glCompressedTexSubImage2D");
+-         return;
+-      }
+-   }
+-
+-   _mesa_store_compressed_texsubimage2d(ctx, target, level, xoffset, yoffset, width,
+-                                 height, format, imageSize, data, texObj, texImage);
+-
+-   t->dirty_images[face] |= (1 << level);
+-}
+-
+ #define SCALED_FLOAT_TO_BYTE( x, scale ) \
+ 		(((GLuint)((255.0F / scale) * (x))) / 2)
+ 
+ static void radeonTexEnv( GLcontext *ctx, GLenum target,
+ 			  GLenum pname, const GLfloat *param )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    GLuint unit = ctx->Texture.CurrentUnit;
+    struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+ 
+@@ -701,7 +285,7 @@ static void radeonTexEnv( GLcontext *ctx, GLenum target,
+        * functions, one mapping [-1.0,0.0] to [-128,0] and one mapping
+        * [0.0,4.0] to [0,127].
+        */
+-      min = driQueryOptionb (&rmesa->optionCache, "no_neg_lod_bias") ?
++      min = driQueryOptionb (&rmesa->radeon.optionCache, "no_neg_lod_bias") ?
+ 	  0.0 : -1.0;
+       bias = CLAMP( *param, min, 4.0 );
+       if ( bias == 0 ) {
+@@ -734,7 +318,7 @@ static void radeonTexParameter( GLcontext *ctx, GLenum target,
+ 				struct gl_texture_object *texObj,
+ 				GLenum pname, const GLfloat *params )
+ {
+-   radeonTexObjPtr t = (radeonTexObjPtr) texObj->DriverData;
++   radeonTexObj* t = radeon_tex_obj(texObj);
+ 
+    if ( RADEON_DEBUG & (DEBUG_STATE|DEBUG_TEXTURE) ) {
+       fprintf( stderr, "%s( %s )\n", __FUNCTION__,
+@@ -762,57 +346,51 @@ static void radeonTexParameter( GLcontext *ctx, GLenum target,
+    case GL_TEXTURE_MAX_LEVEL:
+    case GL_TEXTURE_MIN_LOD:
+    case GL_TEXTURE_MAX_LOD:
++
+       /* This isn't the most efficient solution but there doesn't appear to
+        * be a nice alternative.  Since there's no LOD clamping,
+        * we just have to rely on loading the right subset of mipmap levels
+        * to simulate a clamped LOD.
+        */
+-      driSwapOutTextureObject( (driTextureObject *) t );
++      if (t->mt) {
++         radeon_miptree_unreference(t->mt);
++	 t->mt = 0;
++	 t->validated = GL_FALSE;
++      }
+       break;
+ 
+    default:
+       return;
+    }
+-
+-   /* Mark this texobj as dirty (one bit per tex unit)
+-    */
+-   t->dirty_state = TEX_ALL;
+-}
+-
+-
+-static void radeonBindTexture( GLcontext *ctx, GLenum target,
+-			       struct gl_texture_object *texObj )
+-{
+-   if ( RADEON_DEBUG & (DEBUG_STATE|DEBUG_TEXTURE) ) {
+-      fprintf( stderr, "%s( %p ) unit=%d\n", __FUNCTION__, (void *)texObj,
+-	       ctx->Texture.CurrentUnit );
+-   }
+-
+-   assert( (target != GL_TEXTURE_1D && target != GL_TEXTURE_2D &&
+-            target != GL_TEXTURE_RECTANGLE_NV && target != GL_TEXTURE_CUBE_MAP) ||
+-           (texObj->DriverData != NULL) );
+ }
+ 
+-
+ static void radeonDeleteTexture( GLcontext *ctx,
+ 				 struct gl_texture_object *texObj )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
++   radeonTexObj* t = radeon_tex_obj(texObj);
++   int i;
+ 
+    if ( RADEON_DEBUG & (DEBUG_STATE|DEBUG_TEXTURE) ) {
+       fprintf( stderr, "%s( %p (target = %s) )\n", __FUNCTION__, (void *)texObj,
+ 	       _mesa_lookup_enum_by_nr( texObj->Target ) );
+    }
+ 
+-   if ( t != NULL ) {
+-      if ( rmesa ) {
+-         RADEON_FIREVERTICES( rmesa );
+-      }
+-
+-      driDestroyTextureObject( t );
++   if ( rmesa ) {
++     radeon_firevertices(&rmesa->radeon);
++     for ( i = 0 ; i < rmesa->radeon.glCtx->Const.MaxTextureUnits ; i++ ) {
++       if ( t == rmesa->state.texture.unit[i].texobj ) {
++	 rmesa->state.texture.unit[i].texobj = NULL;
++	 rmesa->hw.tex[i].dirty = GL_FALSE;
++	 rmesa->hw.cube[i].dirty = GL_FALSE;
++       }
++     }
+    }
+ 
++   if (t->mt) {
++      radeon_miptree_unreference(t->mt);
++      t->mt = 0;
++   }
+    /* Free mipmap images and the texture object itself */
+    _mesa_delete_texture_object(ctx, texObj);
+ }
+@@ -832,7 +410,7 @@ static void radeonTexGen( GLcontext *ctx,
+ 			  GLenum pname,
+ 			  const GLfloat *params )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    GLuint unit = ctx->Texture.CurrentUnit;
+    rmesa->recheck_texgen[unit] = GL_TRUE;
+ }
+@@ -846,17 +424,27 @@ static void radeonTexGen( GLcontext *ctx,
+ static struct gl_texture_object *
+ radeonNewTextureObject( GLcontext *ctx, GLuint name, GLenum target )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+-   struct gl_texture_object *obj;
+-   obj = _mesa_new_texture_object(ctx, name, target);
+-   if (!obj)
+-      return NULL;
+-   obj->MaxAnisotropy = rmesa->initialMaxAnisotropy;
+-   radeonAllocTexObj( obj );
+-   return obj;
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
++   radeonTexObj* t = CALLOC_STRUCT(radeon_tex_obj);
++   /* Allocation can fail; hand NULL back instead of dereferencing it. */
++   if (!t)
++      return NULL;
++   _mesa_initialize_texture_object(&t->base, name, target);
++   t->base.MaxAnisotropy = rmesa->radeon.initialMaxAnisotropy;
++   t->border_fallback = GL_FALSE;
++
++   t->pp_txfilter = RADEON_BORDER_MODE_OGL;
++   t->pp_txformat = (RADEON_TXFORMAT_ENDIAN_NO_SWAP |
++		     RADEON_TXFORMAT_PERSPECTIVE_ENABLE);
++   radeonSetTexWrap( t, t->base.WrapS, t->base.WrapT );
++   radeonSetTexMaxAnisotropy( t, t->base.MaxAnisotropy );
++   radeonSetTexFilter( t, t->base.MinFilter, t->base.MagFilter );
++   radeonSetTexBorderColor( t, t->base._BorderChan );
++   return &t->base;
+ }
+ 
+ 
++
+ void radeonInitTextureFuncs( struct dd_function_table *functions )
+ {
+    functions->ChooseTextureFormat	= radeonChooseTextureFormat;
+@@ -864,11 +452,12 @@ void radeonInitTextureFuncs( struct dd_function_table *functions )
+    functions->TexImage2D		= radeonTexImage2D;
+    functions->TexSubImage1D		= radeonTexSubImage1D;
+    functions->TexSubImage2D		= radeonTexSubImage2D;
++   functions->GetTexImage               = radeonGetTexImage;
++   functions->GetCompressedTexImage     = radeonGetCompressedTexImage;
+ 
+    functions->NewTextureObject		= radeonNewTextureObject;
+-   functions->BindTexture		= radeonBindTexture;
++   //   functions->BindTexture		= radeonBindTexture;
+    functions->DeleteTexture		= radeonDeleteTexture;
+-   functions->IsTextureResident		= driIsTextureResident;
+ 
+    functions->TexEnv			= radeonTexEnv;
+    functions->TexParameter		= radeonTexParameter;
+@@ -877,5 +466,12 @@ void radeonInitTextureFuncs( struct dd_function_table *functions )
+    functions->CompressedTexImage2D	= radeonCompressedTexImage2D;
+    functions->CompressedTexSubImage2D	= radeonCompressedTexSubImage2D;
+ 
++   functions->GenerateMipmap = radeonGenerateMipmap;
++
++   functions->NewTextureImage = radeonNewTextureImage;
++   functions->FreeTexImageData = radeonFreeTexImageData;
++   functions->MapTexture = radeonMapTexture;
++   functions->UnmapTexture = radeonUnmapTexture;
++
+    driInitTextureFormats();
+ }
+diff --git a/src/mesa/drivers/dri/radeon/radeon_tex.h b/src/mesa/drivers/dri/radeon/radeon_tex.h
+index 8000880..8c2f9be 100644
+--- a/src/mesa/drivers/dri/radeon/radeon_tex.h
++++ b/src/mesa/drivers/dri/radeon/radeon_tex.h
+@@ -43,10 +43,10 @@ extern void radeonSetTexOffset(__DRIcontext *pDRICtx, GLint texname,
+ 
+ extern void radeonUpdateTextureState( GLcontext *ctx );
+ 
+-extern int radeonUploadTexImages( radeonContextPtr rmesa, radeonTexObjPtr t,
++extern int radeonUploadTexImages( r100ContextPtr rmesa, radeonTexObjPtr t,
+ 				  GLuint face );
+ 
+-extern void radeonDestroyTexObj( radeonContextPtr rmesa, radeonTexObjPtr t );
++extern void radeonDestroyTexObj( r100ContextPtr rmesa, radeonTexObjPtr t );
+ 
+ extern void radeonInitTextureFuncs( struct dd_function_table *functions );
+ 
+diff --git a/src/mesa/drivers/dri/radeon/radeon_texmem.c b/src/mesa/drivers/dri/radeon/radeon_texmem.c
+deleted file mode 100644
+index 5f7bbe6..0000000
+--- a/src/mesa/drivers/dri/radeon/radeon_texmem.c
++++ /dev/null
+@@ -1,404 +0,0 @@
+-/**************************************************************************
+-
+-Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+-                     VA Linux Systems Inc., Fremont, California.
+-
+-All Rights Reserved.
+-
+-Permission is hereby granted, free of charge, to any person obtaining
+-a copy of this software and associated documentation files (the
+-"Software"), to deal in the Software without restriction, including
+-without limitation on the rights to use, copy, modify, merge, publish,
+-distribute, sub license, and/or sell copies of the Software, and to
+-permit persons to whom the Software is furnished to do so, subject to
+-the following conditions:
+-
+-The above copyright notice and this permission notice (including the
+-next paragraph) shall be included in all copies or substantial
+-portions of the Software.
+-
+-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+-NON-INFRINGEMENT. IN NO EVENT SHALL ATI, VA LINUX SYSTEMS AND/OR THEIR
+-SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+-IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+-IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+-SOFTWARE.
+-
+-**************************************************************************/
+-
+-/*
+- * Authors:
+- *   Kevin E. Martin <martin@valinux.com>
+- *   Gareth Hughes <gareth@valinux.com>
+- *
+- */
+-#include <errno.h> 
+-
+-#include "main/glheader.h"
+-#include "main/imports.h"
+-#include "main/context.h"
+-#include "main/macros.h"
+-
+-#include "radeon_context.h"
+-#include "radeon_ioctl.h"
+-#include "radeon_tex.h"
+-
+-#include <unistd.h>  /* for usleep() */
+-
+-
+-/**
+- * Destroy any device-dependent state associated with the texture.  This may
+- * include NULLing out hardware state that points to the texture.
+- */
+-void
+-radeonDestroyTexObj( radeonContextPtr rmesa, radeonTexObjPtr t )
+-{
+-   if ( RADEON_DEBUG & DEBUG_TEXTURE ) {
+-      fprintf( stderr, "%s( %p, %p )\n", __FUNCTION__, (void *)t, (void *)t->base.tObj );
+-   }
+-
+-   if ( rmesa != NULL ) {
+-      unsigned   i;
+-
+-
+-      for ( i = 0 ; i < rmesa->glCtx->Const.MaxTextureUnits ; i++ ) {
+-	 if ( t == rmesa->state.texture.unit[i].texobj ) {
+-	    rmesa->state.texture.unit[i].texobj = NULL;
+-	 }
+-      }
+-   }
+-}
+-
+-
+-/* ------------------------------------------------------------
+- * Texture image conversions
+- */
+-
+-
+-static void radeonUploadRectSubImage( radeonContextPtr rmesa,
+-				      radeonTexObjPtr t, 
+-				      struct gl_texture_image *texImage,
+-				      GLint x, GLint y, 
+-				      GLint width, GLint height )
+-{
+-   const struct gl_texture_format *texFormat = texImage->TexFormat;
+-   int blit_format, dstPitch, done;
+-
+-   switch ( texFormat->TexelBytes ) {
+-   case 1:
+-      blit_format = RADEON_GMC_DST_8BPP_CI;
+-      break;
+-   case 2:
+-      blit_format = RADEON_GMC_DST_16BPP;
+-      break;
+-   case 4:
+-      blit_format = RADEON_GMC_DST_32BPP;
+-      break;
+-   default:
+-      fprintf( stderr, "radeonUploadRectSubImage: unknown blit_format (texelbytes=%d)\n", 
+-      	       texFormat->TexelBytes);
+-      return;
+-   }
+-
+-   t->image[0][0].data = texImage->Data;
+-
+-   /* Currently don't need to cope with small pitches.
+-    */
+-   width = texImage->Width;
+-   height = texImage->Height;
+-   dstPitch = t->pp_txpitch + 32;
+-
+-   {	/* FIXME: prefer GART-texturing if possible */
+-      /* Data not in GART memory, or bad pitch.
+-       */
+-      for (done = 0; done < height ; ) {
+-	 struct radeon_dma_region region;
+-	 int lines = MIN2( height - done, RADEON_BUFFER_SIZE / dstPitch );
+-	 int src_pitch;
+-	 char *tex;
+-
+-         src_pitch = texImage->RowStride * texFormat->TexelBytes;
+-
+-	 tex = (char *)texImage->Data + done * src_pitch;
+-
+-	 memset(&region, 0, sizeof(region));
+-	 radeonAllocDmaRegion( rmesa, &region, lines * dstPitch, 1024 );
+-
+-	 /* Copy texdata to dma:
+-	  */
+-	 if (0)
+-	    fprintf(stderr, "%s: src_pitch %d dst_pitch %d\n",
+-		    __FUNCTION__, src_pitch, dstPitch);
+-
+-	 if (src_pitch == dstPitch) {
+-	    memcpy( region.address + region.start, tex, lines * src_pitch );
+-	 } 
+-	 else {
+-	    char *buf = region.address + region.start;
+-	    int i;
+-	    for (i = 0 ; i < lines ; i++) {
+-	       memcpy( buf, tex, src_pitch );
+-	       buf += dstPitch;
+-	       tex += src_pitch;
+-	    }
+-	 }
+-
+-	 radeonEmitWait( rmesa, RADEON_WAIT_3D );
+-
+-	 
+-
+-	 /* Blit to framebuffer
+-	  */
+-	 radeonEmitBlit( rmesa,
+-		       blit_format,
+-		       dstPitch, GET_START( &region ),
+-		       dstPitch, t->bufAddr,
+-		       0, 0,
+-		       0, done,
+-		       width, lines );
+-	 
+-	 radeonEmitWait( rmesa, RADEON_WAIT_2D );
+-
+-	 radeonReleaseDmaRegion( rmesa, &region, __FUNCTION__ );
+-	 done += lines;
+-      }
+-   }
+-}
+-
+-
+-/**
+- * Upload the texture image associated with texture \a t at the specified
+- * level at the address relative to \a start.
+- */
+-static void uploadSubImage( radeonContextPtr rmesa, radeonTexObjPtr t, 
+-			    GLint hwlevel,
+-			    GLint x, GLint y, GLint width, GLint height,
+-			    GLuint face )
+-{
+-   struct gl_texture_image *texImage = NULL;
+-   GLuint offset;
+-   GLint imageWidth, imageHeight;
+-   GLint ret;
+-   drm_radeon_texture_t tex;
+-   drm_radeon_tex_image_t tmp;
+-   const int level = hwlevel + t->base.firstLevel;
+-
+-   if ( RADEON_DEBUG & DEBUG_TEXTURE ) {
+-      fprintf( stderr, "%s( %p, %p ) level/width/height/face = %d/%d/%d/%u\n", 
+-	       __FUNCTION__, (void *)t, (void *)t->base.tObj, level, width, height, face );
+-   }
+-
+-   ASSERT(face < 6);
+-
+-   /* Ensure we have a valid texture to upload */
+-   if ( ( hwlevel < 0 ) || ( hwlevel >= RADEON_MAX_TEXTURE_LEVELS ) ) {
+-      _mesa_problem(NULL, "bad texture level in %s", __FUNCTION__);
+-      return;
+-   }
+-
+-   texImage = t->base.tObj->Image[face][level];
+-
+-   if ( !texImage ) {
+-      if ( RADEON_DEBUG & DEBUG_TEXTURE )
+-	 fprintf( stderr, "%s: texImage %d is NULL!\n", __FUNCTION__, level );
+-      return;
+-   }
+-   if ( !texImage->Data ) {
+-      if ( RADEON_DEBUG & DEBUG_TEXTURE )
+-	 fprintf( stderr, "%s: image data is NULL!\n", __FUNCTION__ );
+-      return;
+-   }
+-
+-
+-   if (t->base.tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
+-      assert(level == 0);
+-      assert(hwlevel == 0);
+-      if ( RADEON_DEBUG & DEBUG_TEXTURE )
+-	 fprintf( stderr, "%s: image data is rectangular\n", __FUNCTION__);
+-      radeonUploadRectSubImage( rmesa, t, texImage, x, y, width, height );
+-      return;
+-   }
+-
+-   imageWidth = texImage->Width;
+-   imageHeight = texImage->Height;
+-
+-   offset = t->bufAddr + t->base.totalSize * face / 6;
+-
+-   if ( RADEON_DEBUG & (DEBUG_TEXTURE|DEBUG_IOCTL) ) {
+-      GLint imageX = 0;
+-      GLint imageY = 0;
+-      GLint blitX = t->image[face][hwlevel].x;
+-      GLint blitY = t->image[face][hwlevel].y;
+-      GLint blitWidth = t->image[face][hwlevel].width;
+-      GLint blitHeight = t->image[face][hwlevel].height;
+-      fprintf( stderr, "   upload image: %d,%d at %d,%d\n",
+-	       imageWidth, imageHeight, imageX, imageY );
+-      fprintf( stderr, "   upload  blit: %d,%d at %d,%d\n",
+-	       blitWidth, blitHeight, blitX, blitY );
+-      fprintf( stderr, "       blit ofs: 0x%07x level: %d/%d\n",
+-	       (GLuint)offset, hwlevel, level );
+-   }
+-
+-   t->image[face][hwlevel].data = texImage->Data;
+-
+-   /* Init the DRM_RADEON_TEXTURE command / drm_radeon_texture_t struct.
+-    * NOTE: we're always use a 1KB-wide blit and I8 texture format.
+-    * We used to use 1, 2 and 4-byte texels and used to use the texture
+-    * width to dictate the blit width - but that won't work for compressed
+-    * textures. (Brian)
+-    * NOTE: can't do that with texture tiling. (sroland)
+-    */
+-   tex.offset = offset;
+-   tex.image = &tmp;
+-   /* copy (x,y,width,height,data) */
+-   memcpy( &tmp, &t->image[face][hwlevel], sizeof(drm_radeon_tex_image_t) );
+-
+-   if (texImage->TexFormat->TexelBytes) {
+-      /* use multi-byte upload scheme */
+-      tex.height = imageHeight;
+-      tex.width = imageWidth;
+-      tex.format = t->pp_txformat & RADEON_TXFORMAT_FORMAT_MASK;
+-      tex.pitch = MAX2((texImage->Width * texImage->TexFormat->TexelBytes) / 64, 1);
+-      tex.offset += tmp.x & ~1023;
+-      tmp.x = tmp.x % 1024;
+-      if (t->tile_bits & RADEON_TXO_MICRO_TILE_X2) {
+-	 /* need something like "tiled coordinates" ? */
+-	 tmp.y = tmp.x / (tex.pitch * 128) * 2;
+-	 tmp.x = tmp.x % (tex.pitch * 128) / 2 / texImage->TexFormat->TexelBytes;
+-	 tex.pitch |= RADEON_DST_TILE_MICRO >> 22;
+-      }
+-      else {
+-	 tmp.x = tmp.x >> (texImage->TexFormat->TexelBytes >> 1);
+-      }
+-      if ((t->tile_bits & RADEON_TXO_MACRO_TILE) &&
+-	 (texImage->Width * texImage->TexFormat->TexelBytes >= 256)) {
+-	 /* radeon switches off macro tiling for small textures/mipmaps it seems */
+-	 tex.pitch |= RADEON_DST_TILE_MACRO >> 22;
+-      }
+-   }
+-   else {
+-      /* In case of for instance 8x8 texture (2x2 dxt blocks), padding after the first two blocks is
+-         needed (only with dxt1 since 2 dxt3/dxt5 blocks already use 32 Byte). */
+-      /* set tex.height to 1/4 since 1 "macropixel" (dxt-block) has 4 real pixels. Needed
+-         so the kernel module reads the right amount of data. */
+-      tex.format = RADEON_TXFORMAT_I8; /* any 1-byte texel format */
+-      tex.pitch = (BLIT_WIDTH_BYTES / 64);
+-      tex.height = (imageHeight + 3) / 4;
+-      tex.width = (imageWidth + 3) / 4;
+-      switch (t->pp_txformat & RADEON_TXFORMAT_FORMAT_MASK) {
+-      case RADEON_TXFORMAT_DXT1:
+-         tex.width *= 8;
+-         break;
+-      case RADEON_TXFORMAT_DXT23:
+-      case RADEON_TXFORMAT_DXT45:
+-         tex.width *= 16;
+-         break;
+-      }
+-   }
+-
+-   LOCK_HARDWARE( rmesa );
+-   do {
+-      ret = drmCommandWriteRead( rmesa->dri.fd, DRM_RADEON_TEXTURE,
+-                                 &tex, sizeof(drm_radeon_texture_t) );
+-   } while ( ret == -EAGAIN );
+-
+-   UNLOCK_HARDWARE( rmesa );
+-
+-   if ( ret ) {
+-      fprintf( stderr, "DRM_RADEON_TEXTURE: return = %d\n", ret );
+-      fprintf( stderr, "   offset=0x%08x\n",
+-	       offset );
+-      fprintf( stderr, "   image width=%d height=%d\n",
+-	       imageWidth, imageHeight );
+-      fprintf( stderr, "    blit width=%d height=%d data=%p\n",
+-	       t->image[face][hwlevel].width, t->image[face][hwlevel].height,
+-	       t->image[face][hwlevel].data );
+-      exit( 1 );
+-   }
+-}
+-
+-
+-/**
+- * Upload the texture images associated with texture \a t.  This might
+- * require the allocation of texture memory.
+- * 
+- * \param rmesa Context pointer
+- * \param t Texture to be uploaded
+- * \param face Cube map face to be uploaded.  Zero for non-cube maps.
+- */
+-
+-int radeonUploadTexImages( radeonContextPtr rmesa, radeonTexObjPtr t, GLuint face )
+-{
+-   int numLevels;
+-
+-   if ( !t || t->base.totalSize == 0 || t->image_override )
+-      return 0;
+-
+-   if ( RADEON_DEBUG & (DEBUG_TEXTURE|DEBUG_IOCTL) ) {
+-      fprintf( stderr, "%s( %p, %p ) sz=%d lvls=%d-%d\n", __FUNCTION__,
+-	       (void *)rmesa->glCtx, (void *)t->base.tObj, t->base.totalSize,
+-	       t->base.firstLevel, t->base.lastLevel );
+-   }
+-
+-   numLevels = t->base.lastLevel - t->base.firstLevel + 1;
+-
+-   if (RADEON_DEBUG & DEBUG_SYNC) {
+-      fprintf(stderr, "%s: Syncing\n", __FUNCTION__ );
+-      radeonFinish( rmesa->glCtx );
+-   }
+-
+-   LOCK_HARDWARE( rmesa );
+-
+-   if ( t->base.memBlock == NULL ) {
+-      int heap;
+-
+-      heap = driAllocateTexture( rmesa->texture_heaps, rmesa->nr_heaps,
+-				 (driTextureObject *) t );
+-      if ( heap == -1 ) {
+-	 UNLOCK_HARDWARE( rmesa );
+-	 return -1;
+-      }
+-
+-      /* Set the base offset of the texture image */
+-      t->bufAddr = rmesa->radeonScreen->texOffset[heap] 
+-	   + t->base.memBlock->ofs;
+-      t->pp_txoffset = t->bufAddr;
+-
+-      if (!(t->base.tObj->Image[0][0]->IsClientData)) {
+-	 /* hope it's safe to add that here... */
+-	 t->pp_txoffset |= t->tile_bits;
+-      }
+-
+-      /* Mark this texobj as dirty on all units:
+-       */
+-      t->dirty_state = TEX_ALL;
+-   }
+-
+-
+-   /* Let the world know we've used this memory recently.
+-    */
+-   driUpdateTextureLRU( (driTextureObject *) t );
+-   UNLOCK_HARDWARE( rmesa );
+-
+-
+-   /* Upload any images that are new */
+-   if (t->base.dirty_images[face]) {
+-      int i;
+-      for ( i = 0 ; i < numLevels ; i++ ) {
+-         if ( (t->base.dirty_images[face] & (1 << (i+t->base.firstLevel))) != 0 ) {
+-            uploadSubImage( rmesa, t, i, 0, 0, t->image[face][i].width,
+-			    t->image[face][i].height, face );
+-         }
+-      }
+-      t->base.dirty_images[face] = 0;
+-   }
+-
+-   if (RADEON_DEBUG & DEBUG_SYNC) {
+-      fprintf(stderr, "%s: Syncing\n", __FUNCTION__ );
+-      radeonFinish( rmesa->glCtx );
+-   }
+-
+-   return 0;
+-}
+diff --git a/src/mesa/drivers/dri/radeon/radeon_texstate.c b/src/mesa/drivers/dri/radeon/radeon_texstate.c
+index 1e2f654..6a34f1e 100644
+--- a/src/mesa/drivers/dri/radeon/radeon_texstate.c
++++ b/src/mesa/drivers/dri/radeon/radeon_texstate.c
+@@ -43,6 +43,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #include "main/enums.h"
+ 
+ #include "radeon_context.h"
++#include "radeon_mipmap_tree.h"
+ #include "radeon_state.h"
+ #include "radeon_ioctl.h"
+ #include "radeon_swtcl.h"
+@@ -75,10 +76,11 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ #define VALID_FORMAT(f) ( ((f) <= MESA_FORMAT_RGBA_DXT5) \
+ 			     && (tx_table[f].format != 0xffffffff) )
+ 
+-static const struct {
++struct tx_table {
+    GLuint format, filter;
+-}
+-tx_table[] =
++};
 +
-+	GLuint bpp; /** Bytes per texel */
-+	GLuint tilebits; /** RADEON_TXO_xxx_TILE */
-+	GLuint compressed; /** MESA_FORMAT_xxx indicating a compressed format, or 0 if uncompressed */
++static const struct tx_table tx_table[] =
+ {
+    _ALPHA(RGBA8888),
+    _ALPHA_REV(RGBA8888),
+@@ -111,252 +113,6 @@ tx_table[] =
+ #undef _ALPHA
+ #undef _INVALID
+ 
+-/**
+- * This function computes the number of bytes of storage needed for
+- * the given texture object (all mipmap levels, all cube faces).
+- * The \c image[face][level].x/y/width/height parameters for upload/blitting
+- * are computed here.  \c pp_txfilter, \c pp_txformat, etc. will be set here
+- * too.
+- * 
+- * \param rmesa Context pointer
+- * \param tObj GL texture object whose images are to be posted to
+- *                 hardware state.
+- */
+-static void radeonSetTexImages( radeonContextPtr rmesa,
+-				struct gl_texture_object *tObj )
+-{
+-   radeonTexObjPtr t = (radeonTexObjPtr)tObj->DriverData;
+-   const struct gl_texture_image *baseImage = tObj->Image[0][tObj->BaseLevel];
+-   GLint curOffset, blitWidth;
+-   GLint i, texelBytes;
+-   GLint numLevels;
+-   GLint log2Width, log2Height, log2Depth;
+-
+-   /* Set the hardware texture format
+-    */
+-   if ( !t->image_override ) {
+-      t->pp_txformat &= ~(RADEON_TXFORMAT_FORMAT_MASK |
+-                          RADEON_TXFORMAT_ALPHA_IN_MAP);
+-      t->pp_txfilter &= ~RADEON_YUV_TO_RGB;
+-
+-      if ( VALID_FORMAT( baseImage->TexFormat->MesaFormat ) ) {
+-         t->pp_txformat |= tx_table[ baseImage->TexFormat->MesaFormat ].format;
+-         t->pp_txfilter |= tx_table[ baseImage->TexFormat->MesaFormat ].filter;
+-      }
+-      else {
+-         _mesa_problem(NULL, "unexpected texture format in %s", __FUNCTION__);
+-         return;
+-      }
+-   }
+-
+-   texelBytes = baseImage->TexFormat->TexelBytes;
+-
+-   /* Compute which mipmap levels we really want to send to the hardware.
+-    */
+-
+-   if (tObj->Target != GL_TEXTURE_CUBE_MAP)
+-      driCalculateTextureFirstLastLevel( (driTextureObject *) t );
+-   else {
+-      /* r100 can't handle mipmaps for cube/3d textures, so don't waste
+-         memory for them */
+-      t->base.firstLevel = t->base.lastLevel = tObj->BaseLevel;
+-   }
+-   log2Width  = tObj->Image[0][t->base.firstLevel]->WidthLog2;
+-   log2Height = tObj->Image[0][t->base.firstLevel]->HeightLog2;
+-   log2Depth  = tObj->Image[0][t->base.firstLevel]->DepthLog2;
+-
+-   numLevels = t->base.lastLevel - t->base.firstLevel + 1;
+-
+-   assert(numLevels <= RADEON_MAX_TEXTURE_LEVELS);
+-
+-   /* Calculate mipmap offsets and dimensions for blitting (uploading)
+-    * The idea is that we lay out the mipmap levels within a block of
+-    * memory organized as a rectangle of width BLIT_WIDTH_BYTES.
+-    */
+-   curOffset = 0;
+-   blitWidth = BLIT_WIDTH_BYTES;
+-   t->tile_bits = 0;
+-
+-   /* figure out if this texture is suitable for tiling. */
+-   if (texelBytes && (tObj->Target != GL_TEXTURE_RECTANGLE_NV)) {
+-      if (rmesa->texmicrotile && (baseImage->Height > 1)) {
+-	 /* allow 32 (bytes) x 1 mip (which will use two times the space
+-	    the non-tiled version would use) max if base texture is large enough */
+-	 if ((numLevels == 1) ||
+-	   (((baseImage->Width * texelBytes / baseImage->Height) <= 32) &&
+-	       (baseImage->Width * texelBytes > 64)) ||
+-	    ((baseImage->Width * texelBytes / baseImage->Height) <= 16)) {
+-	    /* R100 has two microtile bits (only the txoffset reg, not the blitter)
+-	       weird: X2 + OPT: 32bit correct, 16bit completely hosed
+-		      X2: 32bit correct, 16bit correct
+-		      OPT: 32bit large mips correct, small mips hosed, 16bit completely hosed */
+-	    t->tile_bits |= RADEON_TXO_MICRO_TILE_X2 /*| RADEON_TXO_MICRO_TILE_OPT*/;
+-	 }
+-      }
+-      if ((baseImage->Width * texelBytes >= 256) && (baseImage->Height >= 16)) {
+-	 /* R100 disables macro tiling only if mip width is smaller than 256 bytes, and not
+-	    in the case if height is smaller than 16 (not 100% sure), as does the r200,
+-	    so need to disable macro tiling in that case */
+-	 if ((numLevels == 1) || ((baseImage->Width * texelBytes / baseImage->Height) <= 4)) {
+-	    t->tile_bits |= RADEON_TXO_MACRO_TILE;
+-	 }
+-      }
+-   }
+-
+-   for (i = 0; i < numLevels; i++) {
+-      const struct gl_texture_image *texImage;
+-      GLuint size;
+-
+-      texImage = tObj->Image[0][i + t->base.firstLevel];
+-      if ( !texImage )
+-	 break;
+-
+-      /* find image size in bytes */
+-      if (texImage->IsCompressed) {
+-      /* need to calculate the size AFTER padding even though the texture is
+-         submitted without padding.
+-         Only handle pot textures currently - don't know if npot is even possible,
+-         size calculation would certainly need (trivial) adjustments.
+-         Align (and later pad) to 32byte, not sure what that 64byte blit width is
+-         good for? */
+-         if ((t->pp_txformat & RADEON_TXFORMAT_FORMAT_MASK) == RADEON_TXFORMAT_DXT1) {
+-            /* RGB_DXT1/RGBA_DXT1, 8 bytes per block */
+-            if ((texImage->Width + 3) < 8) /* width one block */
+-               size = texImage->CompressedSize * 4;
+-            else if ((texImage->Width + 3) < 16)
+-               size = texImage->CompressedSize * 2;
+-            else size = texImage->CompressedSize;
+-         }
+-         else /* DXT3/5, 16 bytes per block */
+-            if ((texImage->Width + 3) < 8)
+-               size = texImage->CompressedSize * 2;
+-            else size = texImage->CompressedSize;
+-      }
+-      else if (tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
+-	 size = ((texImage->Width * texelBytes + 63) & ~63) * texImage->Height;
+-      }
+-      else if (t->tile_bits & RADEON_TXO_MICRO_TILE_X2) {
+-	 /* tile pattern is 16 bytes x2. mipmaps stay 32 byte aligned,
+-	    though the actual offset may be different (if texture is less than
+-	    32 bytes width) to the untiled case */
+-	 int w = (texImage->Width * texelBytes * 2 + 31) & ~31;
+-	 size = (w * ((texImage->Height + 1) / 2)) * texImage->Depth;
+-	 blitWidth = MAX2(texImage->Width, 64 / texelBytes);
+-      }
+-      else {
+-	 int w = (texImage->Width * texelBytes + 31) & ~31;
+-	 size = w * texImage->Height * texImage->Depth;
+-	 blitWidth = MAX2(texImage->Width, 64 / texelBytes);
+-      }
+-      assert(size > 0);
+-
+-      /* Align to 32-byte offset.  It is faster to do this unconditionally
+-       * (no branch penalty).
+-       */
+-
+-      curOffset = (curOffset + 0x1f) & ~0x1f;
+-
+-      if (texelBytes) {
+-	 t->image[0][i].x = curOffset; /* fix x and y coords up later together with offset */
+-	 t->image[0][i].y = 0;
+-	 t->image[0][i].width = MIN2(size / texelBytes, blitWidth);
+-	 t->image[0][i].height = (size / texelBytes) / t->image[0][i].width;
+-      }
+-      else {
+-         t->image[0][i].x = curOffset % BLIT_WIDTH_BYTES;
+-         t->image[0][i].y = curOffset / BLIT_WIDTH_BYTES;
+-         t->image[0][i].width  = MIN2(size, BLIT_WIDTH_BYTES);
+-         t->image[0][i].height = size / t->image[0][i].width;     
+-      }
+-
+-#if 0
+-      /* for debugging only and only  applicable to non-rectangle targets */
+-      assert(size % t->image[0][i].width == 0);
+-      assert(t->image[0][i].x == 0
+-             || (size < BLIT_WIDTH_BYTES && t->image[0][i].height == 1));
+-#endif
+-
+-      if (0)
+-         fprintf(stderr,
+-                 "level %d: %dx%d x=%d y=%d w=%d h=%d size=%d at %d\n",
+-                 i, texImage->Width, texImage->Height,
+-                 t->image[0][i].x, t->image[0][i].y,
+-                 t->image[0][i].width, t->image[0][i].height, size, curOffset);
+-
+-      curOffset += size;
+-
+-   }
+-
+-   /* Align the total size of texture memory block.
+-    */
+-   t->base.totalSize = (curOffset + RADEON_OFFSET_MASK) & ~RADEON_OFFSET_MASK;
+-
+-   /* Setup remaining cube face blits, if needed */
+-   if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
+-      const GLuint faceSize = t->base.totalSize;
+-      GLuint face;
+-      /* reuse face 0 x/y/width/height - just update the offset when uploading */
+-      for (face = 1; face < 6; face++) {
+-         for (i = 0; i < numLevels; i++) {
+-            t->image[face][i].x =  t->image[0][i].x;
+-            t->image[face][i].y =  t->image[0][i].y;
+-            t->image[face][i].width  = t->image[0][i].width;
+-            t->image[face][i].height = t->image[0][i].height;
+-         }
+-      }
+-      t->base.totalSize = 6 * faceSize; /* total texmem needed */
+-   }
+-
+-   /* Hardware state:
+-    */
+-   t->pp_txfilter &= ~RADEON_MAX_MIP_LEVEL_MASK;
+-   t->pp_txfilter |= (numLevels - 1) << RADEON_MAX_MIP_LEVEL_SHIFT;
+-
+-   t->pp_txformat &= ~(RADEON_TXFORMAT_WIDTH_MASK |
+-		       RADEON_TXFORMAT_HEIGHT_MASK |
+-                       RADEON_TXFORMAT_CUBIC_MAP_ENABLE |
+-                       RADEON_TXFORMAT_F5_WIDTH_MASK |
+-                       RADEON_TXFORMAT_F5_HEIGHT_MASK);
+-   t->pp_txformat |= ((log2Width << RADEON_TXFORMAT_WIDTH_SHIFT) |
+-		      (log2Height << RADEON_TXFORMAT_HEIGHT_SHIFT));
+-
+-   if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
+-      assert(log2Width == log2Height);
+-      t->pp_txformat |= ((log2Width << RADEON_TXFORMAT_F5_WIDTH_SHIFT) |
+-                         (log2Height << RADEON_TXFORMAT_F5_HEIGHT_SHIFT) |
+-                         (RADEON_TXFORMAT_CUBIC_MAP_ENABLE));
+-      t->pp_cubic_faces = ((log2Width << RADEON_FACE_WIDTH_1_SHIFT) |
+-                           (log2Height << RADEON_FACE_HEIGHT_1_SHIFT) |
+-                           (log2Width << RADEON_FACE_WIDTH_2_SHIFT) |
+-                           (log2Height << RADEON_FACE_HEIGHT_2_SHIFT) |
+-                           (log2Width << RADEON_FACE_WIDTH_3_SHIFT) |
+-                           (log2Height << RADEON_FACE_HEIGHT_3_SHIFT) |
+-                           (log2Width << RADEON_FACE_WIDTH_4_SHIFT) |
+-                           (log2Height << RADEON_FACE_HEIGHT_4_SHIFT));
+-   }
+-
+-   t->pp_txsize = (((tObj->Image[0][t->base.firstLevel]->Width - 1) << 0) |
+-                   ((tObj->Image[0][t->base.firstLevel]->Height - 1) << 16));
+-
+-   /* Only need to round to nearest 32 for textures, but the blitter
+-    * requires 64-byte aligned pitches, and we may/may not need the
+-    * blitter.   NPOT only!
+-    */
+-   if ( !t->image_override ) {
+-      if (baseImage->IsCompressed)
+-         t->pp_txpitch = (tObj->Image[0][t->base.firstLevel]->Width + 63) & ~(63);
+-      else
+-         t->pp_txpitch = ((tObj->Image[0][t->base.firstLevel]->Width * texelBytes) + 63) & ~(63);
+-      t->pp_txpitch -= 32;
+-   }
+-
+-   t->dirty_state = TEX_ALL;
+-
+-   /* FYI: radeonUploadTexImages( rmesa, t ); used to be called here */
+-}
+-
+-
+-
+ /* ================================================================
+  * Texture combine functions
+  */
+@@ -503,7 +259,7 @@ do {							\
+ 
+ static GLboolean radeonUpdateTextureEnv( GLcontext *ctx, int unit )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+    GLuint color_combine, alpha_combine;
+    const GLuint color_combine0 = RADEON_COLOR_ARG_A_ZERO | RADEON_COLOR_ARG_B_ZERO
+@@ -846,22 +602,21 @@ static GLboolean radeonUpdateTextureEnv( GLcontext *ctx, int unit )
+ void radeonSetTexOffset(__DRIcontext * pDRICtx, GLint texname,
+                         unsigned long long offset, GLint depth, GLuint pitch)
+ {
+-	radeonContextPtr rmesa = pDRICtx->driverPrivate;
++	r100ContextPtr rmesa = pDRICtx->driverPrivate;
+ 	struct gl_texture_object *tObj =
+-	    _mesa_lookup_texture(rmesa->glCtx, texname);
+-	radeonTexObjPtr t;
++	    _mesa_lookup_texture(rmesa->radeon.glCtx, texname);
++	radeonTexObjPtr t = radeon_tex_obj(tObj);
+ 
+ 	if (tObj == NULL)
+ 		return;
+ 
+-	t = (radeonTexObjPtr) tObj->DriverData;
+-
+ 	t->image_override = GL_TRUE;
+ 
+ 	if (!offset)
+ 		return;
+-
+-	t->pp_txoffset = offset;
++	
++	t->bo = NULL;
++	t->override_offset = offset;
+ 	t->pp_txpitch = pitch - 32;
+ 
+ 	switch (depth) {
+@@ -901,12 +656,58 @@ void radeonSetTexOffset(__DRIcontext * pDRICtx, GLint texname,
+                               RADEON_TXFORMAT_NON_POWER2)
+ 
+ 
+-static void import_tex_obj_state( radeonContextPtr rmesa,
++static void disable_tex_obj_state( r100ContextPtr rmesa, 
++				   int unit )
++{
++   /* do not use RADEON_DB_STATE to avoid stale texture caches */
++   uint32_t *cmd = &rmesa->hw.tex[unit].cmd[TEX_CMD_0];
++   GLuint se_coord_fmt = rmesa->hw.set.cmd[SET_SE_COORDFMT];
++   GLuint *txr_cmd = RADEON_DB_STATE( txr[unit] );
 +
-+	radeon_mipmap_level levels[RADEON_MAX_TEXTURE_LEVELS];
-+};
++   RADEON_STATECHANGE( rmesa, tex[unit] );
 +
-+radeon_mipmap_tree* radeon_miptree_create(radeonContextPtr rmesa, radeonTexObj *t,
-+		GLenum target, GLuint firstLevel, GLuint lastLevel,
-+		GLuint width0, GLuint height0, GLuint depth0,
-+		GLuint bpp, GLuint tilebits, GLuint compressed);
-+void radeon_miptree_reference(radeon_mipmap_tree *mt);
-+void radeon_miptree_unreference(radeon_mipmap_tree *mt);
++   RADEON_STATECHANGE( rmesa, tcl );
++   rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] &= ~(RADEON_ST_BIT(unit) |
++					     RADEON_Q_BIT(unit));
++   
++   if (rmesa->radeon.TclFallback & (RADEON_TCL_FALLBACK_TEXGEN_0<<unit)) {
++     TCL_FALLBACK( rmesa->radeon.glCtx, (RADEON_TCL_FALLBACK_TEXGEN_0<<unit), GL_FALSE);
++     rmesa->recheck_texgen[unit] = GL_TRUE;
++   }
 +
-+GLboolean radeon_miptree_matches_image(radeon_mipmap_tree *mt,
-+		struct gl_texture_image *texImage, GLuint face, GLuint level);
-+GLboolean radeon_miptree_matches_texture(radeon_mipmap_tree *mt, struct gl_texture_object *texObj);
-+void radeon_try_alloc_miptree(radeonContextPtr rmesa, radeonTexObj *t,
-+			      struct gl_texture_image *texImage, GLuint face, GLuint level);
++   if (rmesa->hw.tex[unit].cmd[TEX_PP_TXFORMAT] & RADEON_TXFORMAT_CUBIC_MAP_ENABLE) {
++     /* this seems to be a genuine (r100 only?) hw bug. Need to remove the
++	cubic_map bit on unit 2 when the unit is disabled, otherwise every
++	2nd (2d) mipmap on unit 0 will be broken (may not be needed for other
++	units, better be safe than sorry though).*/
++     RADEON_STATECHANGE( rmesa, tex[unit] );
++     rmesa->hw.tex[unit].cmd[TEX_PP_TXFORMAT] &= ~RADEON_TXFORMAT_CUBIC_MAP_ENABLE;
++   }
 +
++   {
++      GLuint inputshift = RADEON_TEXGEN_0_INPUT_SHIFT + unit*4;
++      GLuint tmp = rmesa->TexGenEnabled;
 +
-+#endif /* __RADEON_MIPMAP_TREE_H_ */
++      rmesa->TexGenEnabled &= ~(RADEON_TEXGEN_TEXMAT_0_ENABLE<<unit);
++      rmesa->TexGenEnabled &= ~(RADEON_TEXMAT_0_ENABLE<<unit);
++      rmesa->TexGenEnabled &= ~(RADEON_TEXGEN_INPUT_MASK<<inputshift);
++      rmesa->TexGenNeedNormals[unit] = 0;
++      rmesa->TexGenEnabled |= 
++	(RADEON_TEXGEN_INPUT_TEXCOORD_0+unit) << inputshift;
++
++      if (tmp != rmesa->TexGenEnabled) {
++	rmesa->recheck_texgen[unit] = GL_TRUE;
++	rmesa->radeon.NewGLState |= _NEW_TEXTURE_MATRIX;
++      }
++   }
++}
++
++static void import_tex_obj_state( r100ContextPtr rmesa,
+ 				  int unit,
+ 				  radeonTexObjPtr texobj )
+ {
+ /* do not use RADEON_DB_STATE to avoid stale texture caches */
+-   int *cmd = &rmesa->hw.tex[unit].cmd[TEX_CMD_0];
++   uint32_t *cmd = &rmesa->hw.tex[unit].cmd[TEX_CMD_0];
+    GLuint se_coord_fmt = rmesa->hw.set.cmd[SET_SE_COORDFMT];
+ 
+    RADEON_STATECHANGE( rmesa, tex[unit] );
+@@ -915,10 +716,9 @@ static void import_tex_obj_state( radeonContextPtr rmesa,
+    cmd[TEX_PP_TXFILTER] |= texobj->pp_txfilter & TEXOBJ_TXFILTER_MASK;
+    cmd[TEX_PP_TXFORMAT] &= ~TEXOBJ_TXFORMAT_MASK;
+    cmd[TEX_PP_TXFORMAT] |= texobj->pp_txformat & TEXOBJ_TXFORMAT_MASK;
+-   cmd[TEX_PP_TXOFFSET] = texobj->pp_txoffset;
+    cmd[TEX_PP_BORDER_COLOR] = texobj->pp_border_color;
+ 
+-   if (texobj->base.tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
++   if (texobj->base.Target == GL_TEXTURE_RECTANGLE_NV) {
+       GLuint *txr_cmd = RADEON_DB_STATE( txr[unit] );
+       txr_cmd[TXR_PP_TEX_SIZE] = texobj->pp_txsize; /* NPOT only! */
+       txr_cmd[TXR_PP_TEX_PITCH] = texobj->pp_txpitch; /* NPOT only! */
+@@ -928,22 +728,12 @@ static void import_tex_obj_state( radeonContextPtr rmesa,
+    else {
+       se_coord_fmt &= ~(RADEON_VTX_ST0_NONPARAMETRIC << unit);
+ 
+-      if (texobj->base.tObj->Target == GL_TEXTURE_CUBE_MAP) {
+-	 int *cube_cmd = &rmesa->hw.cube[unit].cmd[CUBE_CMD_0];
+-	 GLuint bytesPerFace = texobj->base.totalSize / 6;
+-	 ASSERT(texobj->base.totalSize % 6 == 0);
++      if (texobj->base.Target == GL_TEXTURE_CUBE_MAP) {
++	 uint32_t *cube_cmd = &rmesa->hw.cube[unit].cmd[CUBE_CMD_0];
+ 
+ 	 RADEON_STATECHANGE( rmesa, cube[unit] );
+ 	 cube_cmd[CUBE_PP_CUBIC_FACES] = texobj->pp_cubic_faces;
+-	 /* dont know if this setup conforms to OpenGL.. 
+-	  * at least it matches the behavior of mesa software renderer
+-	  */
+-	 cube_cmd[CUBE_PP_CUBIC_OFFSET_0] = texobj->pp_txoffset; /* right */
+-	 cube_cmd[CUBE_PP_CUBIC_OFFSET_1] = texobj->pp_txoffset + 1 * bytesPerFace; /* left */
+-	 cube_cmd[CUBE_PP_CUBIC_OFFSET_2] = texobj->pp_txoffset + 2 * bytesPerFace; /* top */
+-	 cube_cmd[CUBE_PP_CUBIC_OFFSET_3] = texobj->pp_txoffset + 3 * bytesPerFace; /* bottom */
+-	 cube_cmd[CUBE_PP_CUBIC_OFFSET_4] = texobj->pp_txoffset + 4 * bytesPerFace; /* front */
+-	 cmd[TEX_PP_TXOFFSET] = texobj->pp_txoffset + 5 * bytesPerFace; /* back */
++	 /* state filled out in the cube_emit */
+       }
+    }
+ 
+@@ -952,13 +742,11 @@ static void import_tex_obj_state( radeonContextPtr rmesa,
+       rmesa->hw.set.cmd[SET_SE_COORDFMT] = se_coord_fmt;
+    }
+ 
+-   texobj->dirty_state &= ~(1<<unit);
++   rmesa->radeon.NewGLState |= _NEW_TEXTURE_MATRIX;
+ }
+ 
+ 
+-
+-
+-static void set_texgen_matrix( radeonContextPtr rmesa, 
++static void set_texgen_matrix( r100ContextPtr rmesa, 
+ 			       GLuint unit,
+ 			       const GLfloat *s_plane,
+ 			       const GLfloat *t_plane,
+@@ -986,14 +774,14 @@ static void set_texgen_matrix( radeonContextPtr rmesa,
+    rmesa->TexGenMatrix[unit].m[15] = q_plane[3];
+ 
+    rmesa->TexGenEnabled |= RADEON_TEXMAT_0_ENABLE << unit;
+-   rmesa->NewGLState |= _NEW_TEXTURE_MATRIX;
++   rmesa->radeon.NewGLState |= _NEW_TEXTURE_MATRIX;
+ }
+ 
+ /* Returns GL_FALSE if fallback required.
+  */
+ static GLboolean radeon_validate_texgen( GLcontext *ctx, GLuint unit )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+    GLuint inputshift = RADEON_TEXGEN_0_INPUT_SHIFT + unit*4;
+    GLuint tmp = rmesa->TexGenEnabled;
+@@ -1094,283 +882,185 @@ static GLboolean radeon_validate_texgen( GLcontext *ctx, GLuint unit )
+    }
+ 
+    if (tmp != rmesa->TexGenEnabled) {
+-      rmesa->NewGLState |= _NEW_TEXTURE_MATRIX;
++      rmesa->radeon.NewGLState |= _NEW_TEXTURE_MATRIX;
+    }
+ 
+    return GL_TRUE;
+ }
+ 
+-
+-static void disable_tex( GLcontext *ctx, int unit )
+-{
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+-
+-   if (rmesa->hw.ctx.cmd[CTX_PP_CNTL] & (RADEON_TEX_0_ENABLE<<unit)) {
+-      /* Texture unit disabled */
+-      if ( rmesa->state.texture.unit[unit].texobj != NULL ) {
+-	 /* The old texture is no longer bound to this texture unit.
+-	  * Mark it as such.
+-	  */
+-
+-	 rmesa->state.texture.unit[unit].texobj->base.bound &= ~(1UL << unit);
+-	 rmesa->state.texture.unit[unit].texobj = NULL;
+-      }
+-
+-      RADEON_STATECHANGE( rmesa, ctx );
+-      rmesa->hw.ctx.cmd[CTX_PP_CNTL] &= 
+-	  ~((RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE) << unit);
+-
+-      RADEON_STATECHANGE( rmesa, tcl );
+-      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] &= ~(RADEON_ST_BIT(unit) |
+-						RADEON_Q_BIT(unit));
+-
+-      if (rmesa->TclFallback & (RADEON_TCL_FALLBACK_TEXGEN_0<<unit)) {
+-	 TCL_FALLBACK( ctx, (RADEON_TCL_FALLBACK_TEXGEN_0<<unit), GL_FALSE);
+-	 rmesa->recheck_texgen[unit] = GL_TRUE;
+-      }
+-
+-      if (rmesa->hw.tex[unit].cmd[TEX_PP_TXFORMAT] & RADEON_TXFORMAT_CUBIC_MAP_ENABLE) {
+-      /* this seems to be a genuine (r100 only?) hw bug. Need to remove the
+-         cubic_map bit on unit 2 when the unit is disabled, otherwise every
+-	 2nd (2d) mipmap on unit 0 will be broken (may not be needed for other
+-	 units, better be safe than sorry though).*/
+-	 RADEON_STATECHANGE( rmesa, tex[unit] );
+-	 rmesa->hw.tex[unit].cmd[TEX_PP_TXFORMAT] &= ~RADEON_TXFORMAT_CUBIC_MAP_ENABLE;
+-      }
+-
+-      {
+-	 GLuint inputshift = RADEON_TEXGEN_0_INPUT_SHIFT + unit*4;
+-	 GLuint tmp = rmesa->TexGenEnabled;
+-
+-	 rmesa->TexGenEnabled &= ~(RADEON_TEXGEN_TEXMAT_0_ENABLE<<unit);
+-	 rmesa->TexGenEnabled &= ~(RADEON_TEXMAT_0_ENABLE<<unit);
+-	 rmesa->TexGenEnabled &= ~(RADEON_TEXGEN_INPUT_MASK<<inputshift);
+-	 rmesa->TexGenNeedNormals[unit] = 0;
+-	 rmesa->TexGenEnabled |= 
+-	     (RADEON_TEXGEN_INPUT_TEXCOORD_0+unit) << inputshift;
+-
+-	 if (tmp != rmesa->TexGenEnabled) {
+-	    rmesa->recheck_texgen[unit] = GL_TRUE;
+-	    rmesa->NewGLState |= _NEW_TEXTURE_MATRIX;
+-	 }
+-      }
+-   }
+-}
+-
+-static GLboolean enable_tex_2d( GLcontext *ctx, int unit )
+-{
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+-   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+-   struct gl_texture_object *tObj = texUnit->_Current;
+-   radeonTexObjPtr t = (radeonTexObjPtr) tObj->DriverData;
+-
+-   /* Need to load the 2d images associated with this unit.
+-    */
+-   if (t->pp_txformat & RADEON_TXFORMAT_NON_POWER2) {
+-      t->pp_txformat &= ~RADEON_TXFORMAT_NON_POWER2;
+-      t->base.dirty_images[0] = ~0;
+-   }
+-
+-   ASSERT(tObj->Target == GL_TEXTURE_2D || tObj->Target == GL_TEXTURE_1D);
+-
+-   if ( t->base.dirty_images[0] ) {
+-      RADEON_FIREVERTICES( rmesa );
+-      radeonSetTexImages( rmesa, tObj );
+-      radeonUploadTexImages( rmesa, (radeonTexObjPtr) tObj->DriverData, 0 );
+-      if ( !t->base.memBlock && !t->image_override ) 
+-	return GL_FALSE;
+-   }
+-
+-   return GL_TRUE;
+-}
+-
+-static GLboolean enable_tex_cube( GLcontext *ctx, int unit )
++/**
++ * Compute the cached hardware register values for the given texture object.
++ *
++ * \param rmesa Context pointer
++ * \param t the radeon texture object
++ */
++static GLboolean setup_hardware_state(r100ContextPtr rmesa, radeonTexObj *t, int unit)
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+-   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+-   struct gl_texture_object *tObj = texUnit->_Current;
+-   radeonTexObjPtr t = (radeonTexObjPtr) tObj->DriverData;
+-   GLuint face;
+-
+-   /* Need to load the 2d images associated with this unit.
+-    */
+-   if (t->pp_txformat & RADEON_TXFORMAT_NON_POWER2) {
+-      t->pp_txformat &= ~RADEON_TXFORMAT_NON_POWER2;
+-      for (face = 0; face < 6; face++)
+-         t->base.dirty_images[face] = ~0;
+-   }
++   const struct gl_texture_image *firstImage;
++   GLint log2Width, log2Height, log2Depth, texelBytes;
+ 
+-   ASSERT(tObj->Target == GL_TEXTURE_CUBE_MAP);
++   firstImage = t->base.Image[0][t->mt->firstLevel];   
+ 
+-   if ( t->base.dirty_images[0] || t->base.dirty_images[1] ||
+-        t->base.dirty_images[2] || t->base.dirty_images[3] ||
+-        t->base.dirty_images[4] || t->base.dirty_images[5] ) {
+-      /* flush */
+-      RADEON_FIREVERTICES( rmesa );
+-      /* layout memory space, once for all faces */
+-      radeonSetTexImages( rmesa, tObj );
++   if (firstImage->Border > 0) {
++      fprintf(stderr, "%s: border\n", __FUNCTION__);
++      return GL_FALSE;
+    }
+ 
+-   /* upload (per face) */
+-   for (face = 0; face < 6; face++) {
+-      if (t->base.dirty_images[face]) {
+-         radeonUploadTexImages( rmesa, (radeonTexObjPtr) tObj->DriverData, face );
++   log2Width  = firstImage->WidthLog2;
++   log2Height = firstImage->HeightLog2;
++   log2Depth  = firstImage->DepthLog2;
++   texelBytes = firstImage->TexFormat->TexelBytes;
++
++   if (!t->image_override) {
++      if (VALID_FORMAT(firstImage->TexFormat->MesaFormat)) {
++	const struct tx_table *table = tx_table;
++
++	 t->pp_txformat &= ~(RADEON_TXFORMAT_FORMAT_MASK |
++			     RADEON_TXFORMAT_ALPHA_IN_MAP);
++	 t->pp_txfilter &= ~RADEON_YUV_TO_RGB;	 
++	 
++	 t->pp_txformat |= table[ firstImage->TexFormat->MesaFormat ].format;
++	 t->pp_txfilter |= table[ firstImage->TexFormat->MesaFormat ].filter;
++      } else {
++	 _mesa_problem(NULL, "unexpected texture format in %s",
++		       __FUNCTION__);
++	 return GL_FALSE;
+       }
+    }
+-      
+-   if ( !t->base.memBlock ) {
+-      /* texmem alloc failed, use s/w fallback */
+-      return GL_FALSE;
++   
++   t->pp_txfilter &= ~RADEON_MAX_MIP_LEVEL_MASK;
++   t->pp_txfilter |= (t->mt->lastLevel - t->mt->firstLevel) << RADEON_MAX_MIP_LEVEL_SHIFT;
++	
++   t->pp_txformat &= ~(RADEON_TXFORMAT_WIDTH_MASK |
++		       RADEON_TXFORMAT_HEIGHT_MASK |
++		       RADEON_TXFORMAT_CUBIC_MAP_ENABLE |
++		       RADEON_TXFORMAT_F5_WIDTH_MASK |
++		       RADEON_TXFORMAT_F5_HEIGHT_MASK);
++   t->pp_txformat |= ((log2Width << RADEON_TXFORMAT_WIDTH_SHIFT) |
++		      (log2Height << RADEON_TXFORMAT_HEIGHT_SHIFT));
++   
++   t->tile_bits = 0;
++   
++   if (t->base.Target == GL_TEXTURE_CUBE_MAP) {
++      ASSERT(log2Width == log2Height);
++      t->pp_txformat |= ((log2Width << RADEON_TXFORMAT_F5_WIDTH_SHIFT) |
++			 (log2Height << RADEON_TXFORMAT_F5_HEIGHT_SHIFT) |
++			 /* don't think we need this bit, if it exists at all - fglrx does not set it */
++			 (RADEON_TXFORMAT_CUBIC_MAP_ENABLE));
++      t->pp_cubic_faces = ((log2Width << RADEON_FACE_WIDTH_1_SHIFT) |
++                           (log2Height << RADEON_FACE_HEIGHT_1_SHIFT) |
++                           (log2Width << RADEON_FACE_WIDTH_2_SHIFT) |
++                           (log2Height << RADEON_FACE_HEIGHT_2_SHIFT) |
++                           (log2Width << RADEON_FACE_WIDTH_3_SHIFT) |
++                           (log2Height << RADEON_FACE_HEIGHT_3_SHIFT) |
++                           (log2Width << RADEON_FACE_WIDTH_4_SHIFT) |
++                           (log2Height << RADEON_FACE_HEIGHT_4_SHIFT));
+    }
+ 
+-   return GL_TRUE;
+-}
++   t->pp_txsize = (((firstImage->Width - 1) << RADEON_TEX_USIZE_SHIFT)
++		   | ((firstImage->Height - 1) << RADEON_TEX_VSIZE_SHIFT));
+ 
+-static GLboolean enable_tex_rect( GLcontext *ctx, int unit )
+-{
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+-   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+-   struct gl_texture_object *tObj = texUnit->_Current;
+-   radeonTexObjPtr t = (radeonTexObjPtr) tObj->DriverData;
+-
+-   if (!(t->pp_txformat & RADEON_TXFORMAT_NON_POWER2)) {
+-      t->pp_txformat |= RADEON_TXFORMAT_NON_POWER2;
+-      t->base.dirty_images[0] = ~0;
++   if ( !t->image_override ) {
++      if (firstImage->IsCompressed)
++         t->pp_txpitch = (firstImage->Width + 63) & ~(63);
++      else
++         t->pp_txpitch = ((firstImage->Width * texelBytes) + 63) & ~(63);
++      t->pp_txpitch -= 32;
+    }
+ 
+-   ASSERT(tObj->Target == GL_TEXTURE_RECTANGLE_NV);
+-
+-   if ( t->base.dirty_images[0] ) {
+-      RADEON_FIREVERTICES( rmesa );
+-      radeonSetTexImages( rmesa, tObj );
+-      radeonUploadTexImages( rmesa, (radeonTexObjPtr) tObj->DriverData, 0 );
+-      if ( !t->base.memBlock &&
+-           !t->image_override /* && !rmesa->prefer_gart_client_texturing  FIXME */ ) {
+-	 fprintf(stderr, "%s: upload failed\n", __FUNCTION__);
+-	 return GL_FALSE;
+-      }
++   if (t->base.Target == GL_TEXTURE_RECTANGLE_NV) {
++      t->pp_txformat |= RADEON_TXFORMAT_NON_POWER2;
+    }
+ 
+    return GL_TRUE;
+ }
+ 
+-
+-static GLboolean update_tex_common( GLcontext *ctx, int unit )
++static GLboolean radeon_validate_texture(GLcontext *ctx, struct gl_texture_object *texObj, int unit)
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+-   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+-   struct gl_texture_object *tObj = texUnit->_Current;
+-   radeonTexObjPtr t = (radeonTexObjPtr) tObj->DriverData;
+-   GLenum format;
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
++   radeonTexObj *t = radeon_tex_obj(texObj);
++   int ret;
+ 
+-   /* Fallback if there's a texture border */
+-   if ( tObj->Image[0][tObj->BaseLevel]->Border > 0 ) {
+-      fprintf(stderr, "%s: border\n", __FUNCTION__);
++   if (!radeon_validate_texture_miptree(ctx, texObj))
+       return GL_FALSE;
+-   }
++
++   ret = setup_hardware_state(rmesa, t, unit);
++   if (ret == GL_FALSE)
++     return GL_FALSE;
++
+    /* yuv conversion only works in first unit */
+    if (unit != 0 && (t->pp_txfilter & RADEON_YUV_TO_RGB))
+       return GL_FALSE;
+ 
+-   /* Update state if this is a different texture object to last
+-    * time.
+-    */
+-   if ( rmesa->state.texture.unit[unit].texobj != t ) {
+-      if ( rmesa->state.texture.unit[unit].texobj != NULL ) {
+-	 /* The old texture is no longer bound to this texture unit.
+-	  * Mark it as such.
+-	  */
+-
+-	 rmesa->state.texture.unit[unit].texobj->base.bound &= 
+-	     ~(1UL << unit);
+-      }
++   RADEON_STATECHANGE( rmesa, ctx );
++   rmesa->hw.ctx.cmd[CTX_PP_CNTL] |= 
++     (RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE) << unit;
+ 
+-      rmesa->state.texture.unit[unit].texobj = t;
+-      t->base.bound |= (1UL << unit);
+-      t->dirty_state |= 1<<unit;
+-      driUpdateTextureLRU( (driTextureObject *) t ); /* XXX: should be locked! */
+-   }
++   RADEON_STATECHANGE( rmesa, tcl );
++   rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] |= RADEON_ST_BIT(unit);
+ 
++   rmesa->recheck_texgen[unit] = GL_TRUE;
+ 
+-   /* Newly enabled?
+-    */
+-   if ( !(rmesa->hw.ctx.cmd[CTX_PP_CNTL] & (RADEON_TEX_0_ENABLE<<unit))) {
+-      RADEON_STATECHANGE( rmesa, ctx );
+-      rmesa->hw.ctx.cmd[CTX_PP_CNTL] |= 
+-	  (RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE) << unit;
+-
+-      RADEON_STATECHANGE( rmesa, tcl );
+-
+-      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] |= RADEON_ST_BIT(unit);
+-
+-      rmesa->recheck_texgen[unit] = GL_TRUE;
+-   }
+-
+-   if (t->dirty_state & (1<<unit)) {
+-      import_tex_obj_state( rmesa, unit, t );
+-      /* may need to update texture matrix (for texrect adjustments) */
+-      rmesa->NewGLState |= _NEW_TEXTURE_MATRIX;
+-   }
++   import_tex_obj_state( rmesa, unit, t );
+ 
+    if (rmesa->recheck_texgen[unit]) {
+       GLboolean fallback = !radeon_validate_texgen( ctx, unit );
+       TCL_FALLBACK( ctx, (RADEON_TCL_FALLBACK_TEXGEN_0<<unit), fallback);
+       rmesa->recheck_texgen[unit] = 0;
+-      rmesa->NewGLState |= _NEW_TEXTURE_MATRIX;
++      rmesa->radeon.NewGLState |= _NEW_TEXTURE_MATRIX;
+    }
+ 
+-   format = tObj->Image[0][tObj->BaseLevel]->_BaseFormat;
+-   if ( rmesa->state.texture.unit[unit].format != format ||
+-	rmesa->state.texture.unit[unit].envMode != texUnit->EnvMode ) {
+-      rmesa->state.texture.unit[unit].format = format;
+-      rmesa->state.texture.unit[unit].envMode = texUnit->EnvMode;
+-      if ( ! radeonUpdateTextureEnv( ctx, unit ) ) {
+-	 return GL_FALSE;
+-      }
++   if ( ! radeonUpdateTextureEnv( ctx, unit ) ) {
++     return GL_FALSE;
+    }
+-
+    FALLBACK( rmesa, RADEON_FALLBACK_BORDER_MODE, t->border_fallback );
++
++   t->validated = GL_TRUE;
+    return !t->border_fallback;
+ }
+ 
+-
+-
+ static GLboolean radeonUpdateTextureUnit( GLcontext *ctx, int unit )
+ {
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+ 
+-   if ( texUnit->_ReallyEnabled & (TEXTURE_RECT_BIT) ) {
+-      return (enable_tex_rect( ctx, unit ) &&
+-	      update_tex_common( ctx, unit ));
+-   }
+-   else if ( texUnit->_ReallyEnabled & (TEXTURE_1D_BIT | TEXTURE_2D_BIT) ) {
+-      return (enable_tex_2d( ctx, unit ) &&
+-	      update_tex_common( ctx, unit ));
+-   }
+-   else if ( texUnit->_ReallyEnabled & (TEXTURE_CUBE_BIT) ) {
+-      return (enable_tex_cube( ctx, unit ) &&
+-	      update_tex_common( ctx, unit ));
++
++   if (ctx->Texture.Unit[unit]._ReallyEnabled & TEXTURE_3D_BIT) {
++     return GL_FALSE;
+    }
+-   else if ( texUnit->_ReallyEnabled ) {
+-      return GL_FALSE;
++
++   if (!ctx->Texture.Unit[unit]._ReallyEnabled) {
++     /* disable the unit */
++     disable_tex_obj_state(rmesa, unit);
++     return GL_TRUE;
+    }
+-   else {
+-      disable_tex( ctx, unit );
+-      return GL_TRUE;
++
++   if (!radeon_validate_texture(ctx, ctx->Texture.Unit[unit]._Current, unit)) {
++    _mesa_warning(ctx,
++		  "failed to validate texture for unit %d.\n",
++		  unit);
++    rmesa->state.texture.unit[unit].texobj = NULL;
++    return GL_FALSE;
+    }
++   rmesa->state.texture.unit[unit].texobj = radeon_tex_obj(ctx->Texture.Unit[unit]._Current);
++   return GL_TRUE;
+ }
+ 
+ void radeonUpdateTextureState( GLcontext *ctx )
+ {
+-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
++   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+    GLboolean ok;
+ 
++   /* start with every texture unit disabled; validated units re-enable themselves */
++   RADEON_STATECHANGE( rmesa, ctx );
++   rmesa->hw.ctx.cmd[CTX_PP_CNTL] &= ~((RADEON_TEX_ENABLE_MASK) | (RADEON_TEX_BLEND_ENABLE_MASK));
++
+    ok = (radeonUpdateTextureUnit( ctx, 0 ) &&
+ 	 radeonUpdateTextureUnit( ctx, 1 ) &&
+ 	 radeonUpdateTextureUnit( ctx, 2 ));
+ 
+    FALLBACK( rmesa, RADEON_FALLBACK_TEXTURE, !ok );
+ 
+-   if (rmesa->TclFallback)
++   if (rmesa->radeon.TclFallback)
+       radeonChooseVertexState( ctx );
+ }
 diff --git a/src/mesa/drivers/dri/radeon/radeon_texture.c b/src/mesa/drivers/dri/radeon/radeon_texture.c
 new file mode 100644
 index 0000000..63680b4
@@ -6081,6 +35305,349 @@ index 0000000..d90fda7
 +				 struct gl_texture_image *texImage);
 +
 +#endif
--- 
-1.6.0.3
-
+diff --git a/src/mesa/drivers/dri/radeon/server/radeon_reg.h b/src/mesa/drivers/dri/radeon/server/radeon_reg.h
+index 596a8aa..0df634b 100644
+--- a/src/mesa/drivers/dri/radeon/server/radeon_reg.h
++++ b/src/mesa/drivers/dri/radeon/server/radeon_reg.h
+@@ -2031,6 +2031,9 @@
+ #define RADEON_CP_PACKET3_3D_DRAW_INDX              0xC0002A00
+ #define RADEON_CP_PACKET3_LOAD_PALETTE              0xC0002C00
+ #define RADEON_CP_PACKET3_3D_LOAD_VBPNTR            0xC0002F00
++#define R200_CP_CMD_3D_DRAW_VBUF_2      0xC0003400
++#define R200_CP_CMD_3D_DRAW_IMMD_2      0xC0003500
++#define R200_CP_CMD_3D_DRAW_INDX_2      0xC0003600
+ #define RADEON_CP_PACKET3_CNTL_PAINT                0xC0009100
+ #define RADEON_CP_PACKET3_CNTL_BITBLT               0xC0009200
+ #define RADEON_CP_PACKET3_CNTL_SMALLTEXT            0xC0009300
+diff --git a/src/mesa/main/enable.c b/src/mesa/main/enable.c
+index dae576a..1c02617 100644
+--- a/src/mesa/main/enable.c
++++ b/src/mesa/main/enable.c
+@@ -922,10 +922,13 @@ _mesa_set_enable(GLcontext *ctx, GLenum cap, GLboolean state)
+             return;
+          FLUSH_VERTICES(ctx, _NEW_STENCIL);
+          ctx->Stencil.TestTwoSide = state;
+-         if (state)
++         if (state) {
++            ctx->Stencil._BackFace = 2;
+             ctx->_TriangleCaps |= DD_TRI_TWOSTENCIL;
+-         else
++         } else {
++            ctx->Stencil._BackFace = 1;
+             ctx->_TriangleCaps &= ~DD_TRI_TWOSTENCIL;
++         }
+          break;
+ 
+ #if FEATURE_ARB_fragment_program
+diff --git a/src/mesa/main/getstring.c b/src/mesa/main/getstring.c
+index e1008d7..a42c446 100644
+--- a/src/mesa/main/getstring.c
++++ b/src/mesa/main/getstring.c
+@@ -82,7 +82,16 @@ compute_version(const GLcontext *ctx)
+                               ctx->Extensions.ARB_vertex_shader &&
+                               ctx->Extensions.ARB_fragment_shader &&
+                               ctx->Extensions.ARB_texture_non_power_of_two &&
+-                              ctx->Extensions.EXT_blend_equation_separate);
++                              ctx->Extensions.EXT_blend_equation_separate &&
++
++			      /* Technically, 2.0 requires the functionality
++			       * of the EXT version.  Enable 2.0 if either
++			       * extension is available, and assume that a
++			       * driver that only exposes the ATI extension
++			       * will fallback to software when necessary.
++			       */
++			      (ctx->Extensions.EXT_stencil_two_side
++			       || ctx->Extensions.ATI_separate_stencil));
+    const GLboolean ver_2_1 = (ver_2_0 &&
+                               ctx->Extensions.ARB_shading_language_120 &&
+                               ctx->Extensions.EXT_pixel_buffer_object &&
+diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
+index 2014745..144c61e 100644
+--- a/src/mesa/main/mtypes.h
++++ b/src/mesa/main/mtypes.h
+@@ -1108,20 +1108,34 @@ struct gl_scissor_attrib
+ 
+ /**
+  * Stencil attribute group (GL_STENCIL_BUFFER_BIT).
++ *
++ * Three sets of stencil data are tracked so that OpenGL 2.0,
++ * GL_EXT_stencil_two_side, and GL_ATI_separate_stencil can all be supported
++ * simultaneously.  In each of the stencil state arrays, element 0 corresponds
++ * to GL_FRONT.  Element 1 corresponds to the OpenGL 2.0 /
++ * GL_ATI_separate_stencil GL_BACK state.  Element 2 corresponds to the
++ * GL_EXT_stencil_two_side GL_BACK state.
++ *
++ * The derived value \c _BackFace is either 1 or 2 depending on whether or
++ * not GL_STENCIL_TEST_TWO_SIDE_EXT is enabled.
++ *
++ * The derived value \c _TestTwoSide is set when the front-face and back-face
++ * stencil state are different.
+  */
+ struct gl_stencil_attrib
+ {
+    GLboolean Enabled;		/**< Enabled flag */
+    GLboolean TestTwoSide;	/**< GL_EXT_stencil_two_side */
+-   GLubyte ActiveFace;		/**< GL_EXT_stencil_two_side (0 or 1) */
++   GLubyte ActiveFace;		/**< GL_EXT_stencil_two_side (0 or 2) */
+    GLboolean _TestTwoSide;
+-   GLenum Function[2];		/**< Stencil function */
+-   GLenum FailFunc[2];		/**< Fail function */
+-   GLenum ZPassFunc[2];		/**< Depth buffer pass function */
+-   GLenum ZFailFunc[2];		/**< Depth buffer fail function */
+-   GLint Ref[2];		/**< Reference value */
+-   GLuint ValueMask[2];		/**< Value mask */
+-   GLuint WriteMask[2];		/**< Write mask */
++   GLubyte _BackFace;
++   GLenum Function[3];		/**< Stencil function */
++   GLenum FailFunc[3];		/**< Fail function */
++   GLenum ZPassFunc[3];		/**< Depth buffer pass function */
++   GLenum ZFailFunc[3];		/**< Depth buffer fail function */
++   GLint Ref[3];		/**< Reference value */
++   GLuint ValueMask[3];		/**< Value mask */
++   GLuint WriteMask[3];		/**< Write mask */
+    GLuint Clear;		/**< Clear value */
+ };
+ 
+diff --git a/src/mesa/main/stencil.c b/src/mesa/main/stencil.c
+index 2a4c38b..b4ea997 100644
+--- a/src/mesa/main/stencil.c
++++ b/src/mesa/main/stencil.c
+@@ -27,21 +27,6 @@
+  * \file stencil.c
+  * Stencil operations.
+  *
+- * Note: There's an incompatibility between GL_EXT_stencil_two_side and
+- * OpenGL 2.0's two-sided stencil feature.
+- *
+- * With GL_EXT_stencil_two_side, calling glStencilOp/Func/Mask() only the
+- * front OR back face state (as set by glActiveStencilFaceEXT) is set.
+- *
+- * But with OpenGL 2.0, calling glStencilOp/Func/Mask() sets BOTH the
+- * front AND back state.
+- *
+- * So either we advertise the GL_EXT_stencil_two_side extension, or OpenGL
+- * 2.0, but not both.
+- *
+- * Also, note that GL_ATI_separate_stencil is different as well:
+- * glStencilFuncSeparateATI(GLenum frontfunc, GLenum backfunc, ...)  vs.
+- * glStencilFuncSeparate(GLenum face, GLenum func, ...).
+  */
+ 
+ 
+@@ -198,6 +183,7 @@ _mesa_StencilFunc( GLenum func, GLint ref, GLuint mask )
+ {
+    GET_CURRENT_CONTEXT(ctx);
+    const GLint stencilMax = (1 << ctx->DrawBuffer->Visual.stencilBits) - 1;
++   const GLint face = ctx->Stencil.ActiveFace;
+    ASSERT_OUTSIDE_BEGIN_END(ctx);
+ 
+    if (!validate_stencil_func(ctx, func)) {
+@@ -207,9 +193,7 @@ _mesa_StencilFunc( GLenum func, GLint ref, GLuint mask )
+ 
+    ref = CLAMP( ref, 0, stencilMax );
+ 
+-   if (ctx->Extensions.EXT_stencil_two_side) {
+-      /* only set active face state */
+-      const GLint face = ctx->Stencil.ActiveFace;
++   if (face != 0) {
+       if (ctx->Stencil.Function[face] == func &&
+           ctx->Stencil.ValueMask[face] == mask &&
+           ctx->Stencil.Ref[face] == ref)
+@@ -218,9 +202,12 @@ _mesa_StencilFunc( GLenum func, GLint ref, GLuint mask )
+       ctx->Stencil.Function[face] = func;
+       ctx->Stencil.Ref[face] = ref;
+       ctx->Stencil.ValueMask[face] = mask;
+-      if (ctx->Driver.StencilFuncSeparate) {
+-         ctx->Driver.StencilFuncSeparate(ctx, face ? GL_BACK : GL_FRONT,
+-                                         func, ref, mask);
++
++      /* Only propagate the change to the driver if EXT_stencil_two_side
++       * is enabled.
++       */
++      if (ctx->Driver.StencilFuncSeparate && ctx->Stencil.TestTwoSide) {
++         ctx->Driver.StencilFuncSeparate(ctx, GL_BACK, func, ref, mask);
+       }
+    }
+    else {
+@@ -237,7 +224,9 @@ _mesa_StencilFunc( GLenum func, GLint ref, GLuint mask )
+       ctx->Stencil.Ref[0]       = ctx->Stencil.Ref[1]       = ref;
+       ctx->Stencil.ValueMask[0] = ctx->Stencil.ValueMask[1] = mask;
+       if (ctx->Driver.StencilFuncSeparate) {
+-         ctx->Driver.StencilFuncSeparate(ctx, GL_FRONT_AND_BACK,
++         ctx->Driver.StencilFuncSeparate(ctx,
++					 ((ctx->Stencil.TestTwoSide)
++					  ? GL_FRONT : GL_FRONT_AND_BACK),
+                                          func, ref, mask);
+       }
+    }
+@@ -259,17 +248,23 @@ void GLAPIENTRY
+ _mesa_StencilMask( GLuint mask )
+ {
+    GET_CURRENT_CONTEXT(ctx);
++   const GLint face = ctx->Stencil.ActiveFace;
++
+    ASSERT_OUTSIDE_BEGIN_END(ctx);
+ 
+-   if (ctx->Extensions.EXT_stencil_two_side) {
+-      /* only set active face state */
+-      const GLint face = ctx->Stencil.ActiveFace;
++   if (face != 0) {
++      /* Only modify the EXT_stencil_two_side back-face state.
++       */
+       if (ctx->Stencil.WriteMask[face] == mask)
+          return;
+       FLUSH_VERTICES(ctx, _NEW_STENCIL);
+       ctx->Stencil.WriteMask[face] = mask;
+-      if (ctx->Driver.StencilMaskSeparate) {
+-         ctx->Driver.StencilMaskSeparate(ctx, face ? GL_BACK : GL_FRONT, mask);
++
++      /* Only propagate the change to the driver if EXT_stencil_two_side
++       * is enabled.
++       */
++      if (ctx->Driver.StencilMaskSeparate && ctx->Stencil.TestTwoSide) {
++         ctx->Driver.StencilMaskSeparate(ctx, GL_BACK, mask);
+       }
+    }
+    else {
+@@ -280,7 +275,10 @@ _mesa_StencilMask( GLuint mask )
+       FLUSH_VERTICES(ctx, _NEW_STENCIL);
+       ctx->Stencil.WriteMask[0] = ctx->Stencil.WriteMask[1] = mask;
+       if (ctx->Driver.StencilMaskSeparate) {
+-         ctx->Driver.StencilMaskSeparate(ctx, GL_FRONT_AND_BACK, mask);
++         ctx->Driver.StencilMaskSeparate(ctx,
++					 ((ctx->Stencil.TestTwoSide)
++					  ? GL_FRONT : GL_FRONT_AND_BACK),
++					  mask);
+       }
+    }
+ }
+@@ -304,6 +302,8 @@ void GLAPIENTRY
+ _mesa_StencilOp(GLenum fail, GLenum zfail, GLenum zpass)
+ {
+    GET_CURRENT_CONTEXT(ctx);
++   const GLint face = ctx->Stencil.ActiveFace;
++
+    ASSERT_OUTSIDE_BEGIN_END(ctx);
+ 
+    if (!validate_stencil_op(ctx, fail)) {
+@@ -319,9 +319,8 @@ _mesa_StencilOp(GLenum fail, GLenum zfail, GLenum zpass)
+       return;
+    }
+ 
+-   if (ctx->Extensions.EXT_stencil_two_side) {
++   if (face != 0) {
+       /* only set active face state */
+-      const GLint face = ctx->Stencil.ActiveFace;
+       if (ctx->Stencil.ZFailFunc[face] == zfail &&
+           ctx->Stencil.ZPassFunc[face] == zpass &&
+           ctx->Stencil.FailFunc[face] == fail)
+@@ -330,9 +329,12 @@ _mesa_StencilOp(GLenum fail, GLenum zfail, GLenum zpass)
+       ctx->Stencil.ZFailFunc[face] = zfail;
+       ctx->Stencil.ZPassFunc[face] = zpass;
+       ctx->Stencil.FailFunc[face] = fail;
+-      if (ctx->Driver.StencilOpSeparate) {
+-         ctx->Driver.StencilOpSeparate(ctx, face ? GL_BACK : GL_FRONT,
+-                                       fail, zfail, zpass);
++
++      /* Only propagate the change to the driver if EXT_stencil_two_side
++       * is enabled.
++       */
++      if (ctx->Driver.StencilOpSeparate && ctx->Stencil.TestTwoSide) {
++         ctx->Driver.StencilOpSeparate(ctx, GL_BACK, fail, zfail, zpass);
+       }
+    }
+    else {
+@@ -349,7 +351,9 @@ _mesa_StencilOp(GLenum fail, GLenum zfail, GLenum zpass)
+       ctx->Stencil.ZPassFunc[0] = ctx->Stencil.ZPassFunc[1] = zpass;
+       ctx->Stencil.FailFunc[0]  = ctx->Stencil.FailFunc[1]  = fail;
+       if (ctx->Driver.StencilOpSeparate) {
+-         ctx->Driver.StencilOpSeparate(ctx, GL_FRONT_AND_BACK,
++         ctx->Driver.StencilOpSeparate(ctx,
++				       ((ctx->Stencil.TestTwoSide)
++					? GL_FRONT : GL_FRONT_AND_BACK),
+                                        fail, zfail, zpass);
+       }
+    }
+@@ -372,7 +376,7 @@ _mesa_ActiveStencilFaceEXT(GLenum face)
+ 
+    if (face == GL_FRONT || face == GL_BACK) {
+       FLUSH_VERTICES(ctx, _NEW_STENCIL);
+-      ctx->Stencil.ActiveFace = (face == GL_FRONT) ? 0 : 1;
++      ctx->Stencil.ActiveFace = (face == GL_FRONT) ? 0 : 2;
+    }
+    else {
+       _mesa_error(ctx, GL_INVALID_ENUM, "glActiveStencilFaceEXT(face)");
+@@ -513,19 +517,16 @@ _mesa_StencilMaskSeparate(GLenum face, GLuint mask)
+ void
+ _mesa_update_stencil(GLcontext *ctx)
+ {
+-   if (ctx->Extensions.EXT_stencil_two_side) {
+-      ctx->Stencil._TestTwoSide = ctx->Stencil.TestTwoSide;
+-   }
+-   else {
+-      ctx->Stencil._TestTwoSide = 
+-         (ctx->Stencil.Function[0] != ctx->Stencil.Function[1] ||
+-          ctx->Stencil.FailFunc[0] != ctx->Stencil.FailFunc[1] ||
+-          ctx->Stencil.ZPassFunc[0] != ctx->Stencil.ZPassFunc[1] ||
+-          ctx->Stencil.ZFailFunc[0] != ctx->Stencil.ZFailFunc[1] ||
+-          ctx->Stencil.Ref[0] != ctx->Stencil.Ref[1] ||
+-          ctx->Stencil.ValueMask[0] != ctx->Stencil.ValueMask[1] ||
+-          ctx->Stencil.WriteMask[0] != ctx->Stencil.WriteMask[1]);
+-   }
++   const GLint face = ctx->Stencil._BackFace;
++
++   ctx->Stencil._TestTwoSide =
++       (ctx->Stencil.Function[0] != ctx->Stencil.Function[face] ||
++	ctx->Stencil.FailFunc[0] != ctx->Stencil.FailFunc[face] ||
++	ctx->Stencil.ZPassFunc[0] != ctx->Stencil.ZPassFunc[face] ||
++	ctx->Stencil.ZFailFunc[0] != ctx->Stencil.ZFailFunc[face] ||
++	ctx->Stencil.Ref[0] != ctx->Stencil.Ref[face] ||
++	ctx->Stencil.ValueMask[0] != ctx->Stencil.ValueMask[face] ||
++	ctx->Stencil.WriteMask[0] != ctx->Stencil.WriteMask[face]);
+ }
+ 
+ 
+@@ -544,17 +545,24 @@ _mesa_init_stencil(GLcontext *ctx)
+    ctx->Stencil.ActiveFace = 0;  /* 0 = GL_FRONT, 1 = GL_BACK */
+    ctx->Stencil.Function[0] = GL_ALWAYS;
+    ctx->Stencil.Function[1] = GL_ALWAYS;
++   ctx->Stencil.Function[2] = GL_ALWAYS;
+    ctx->Stencil.FailFunc[0] = GL_KEEP;
+    ctx->Stencil.FailFunc[1] = GL_KEEP;
++   ctx->Stencil.FailFunc[2] = GL_KEEP;
+    ctx->Stencil.ZPassFunc[0] = GL_KEEP;
+    ctx->Stencil.ZPassFunc[1] = GL_KEEP;
++   ctx->Stencil.ZPassFunc[2] = GL_KEEP;
+    ctx->Stencil.ZFailFunc[0] = GL_KEEP;
+    ctx->Stencil.ZFailFunc[1] = GL_KEEP;
++   ctx->Stencil.ZFailFunc[2] = GL_KEEP;
+    ctx->Stencil.Ref[0] = 0;
+    ctx->Stencil.Ref[1] = 0;
++   ctx->Stencil.Ref[2] = 0;
+    ctx->Stencil.ValueMask[0] = ~0U;
+    ctx->Stencil.ValueMask[1] = ~0U;
++   ctx->Stencil.ValueMask[2] = ~0U;
+    ctx->Stencil.WriteMask[0] = ~0U;
+    ctx->Stencil.WriteMask[1] = ~0U;
++   ctx->Stencil.WriteMask[2] = ~0U;
+    ctx->Stencil.Clear = 0;
+ }
+diff --git a/src/mesa/swrast/s_stencil.c b/src/mesa/swrast/s_stencil.c
+index c925922..2e84dde 100644
+--- a/src/mesa/swrast/s_stencil.c
++++ b/src/mesa/swrast/s_stencil.c
+@@ -997,10 +997,12 @@ stencil_and_ztest_pixels( GLcontext *ctx, SWspan *span, GLuint face )
+ GLboolean
+ _swrast_stencil_and_ztest_span(GLcontext *ctx, SWspan *span)
+ {
++   const GLuint face = (span->facing == 0) ? 0 : ctx->Stencil._BackFace;
++
+    if (span->arrayMask & SPAN_XY)
+-      return stencil_and_ztest_pixels(ctx, span, span->facing);
++      return stencil_and_ztest_pixels(ctx, span, face);
+    else
+-      return stencil_and_ztest_span(ctx, span, span->facing);
++      return stencil_and_ztest_span(ctx, span, face);
+ }
+ 
+