
#include "dixstruct.h"

#include "xaa.h"
#include "xaalocal.h"

#ifndef RENDER_GENERIC_HELPER
#define RENDER_GENERIC_HELPER

static void RadeonInit3DEngineMMIO(ScrnInfoPtr pScrn);
#ifdef XF86DRI
static void RadeonInit3DEngineCP(ScrnInfoPtr pScrn);
#endif

struct blendinfo {
	Bool dst_alpha;
	Bool src_alpha;
	CARD32 blend_cntl;
};

/* The first part of blend_cntl corresponds to Fa from the render "protocol"
 * document, and the second part to Fb.
 */
static const struct blendinfo RadeonBlendOp[] = {
    /* Clear */
    {0, 0, RADEON_SRC_BLEND_GL_ZERO |
	   RADEON_DST_BLEND_GL_ZERO},
    /* Src */
    {0, 0, RADEON_SRC_BLEND_GL_ONE |
	   RADEON_DST_BLEND_GL_ZERO},
    /* Dst */
    {0, 0, RADEON_SRC_BLEND_GL_ZERO |
	   RADEON_DST_BLEND_GL_ONE},
    /* Over */
    {0, 1, RADEON_SRC_BLEND_GL_ONE |
	   RADEON_DST_BLEND_GL_ONE_MINUS_SRC_ALPHA},
    /* OverReverse */
    {1, 0, RADEON_SRC_BLEND_GL_ONE_MINUS_DST_ALPHA |
	   RADEON_DST_BLEND_GL_ONE},
    /* In */
    {1, 0, RADEON_SRC_BLEND_GL_DST_ALPHA |
	   RADEON_DST_BLEND_GL_ZERO},
    /* InReverse */
    {0, 1, RADEON_SRC_BLEND_GL_ZERO |
	   RADEON_DST_BLEND_GL_SRC_ALPHA},
    /* Out */
    {1, 0, RADEON_SRC_BLEND_GL_ONE_MINUS_DST_ALPHA |
	   RADEON_DST_BLEND_GL_ZERO},
    /* OutReverse */
    {0, 1, RADEON_SRC_BLEND_GL_ZERO |
	   RADEON_DST_BLEND_GL_ONE_MINUS_SRC_ALPHA},
    /* Atop */
    {1, 1, RADEON_SRC_BLEND_GL_DST_ALPHA |
	   RADEON_DST_BLEND_GL_ONE_MINUS_SRC_ALPHA},
    /* AtopReverse */
    {1, 1, RADEON_SRC_BLEND_GL_ONE_MINUS_DST_ALPHA |
	   RADEON_DST_BLEND_GL_SRC_ALPHA},
    /* Xor */
    {1, 1, RADEON_SRC_BLEND_GL_ONE_MINUS_DST_ALPHA |
	   RADEON_DST_BLEND_GL_ONE_MINUS_SRC_ALPHA},
    /* Add */
    {0, 0, RADEON_SRC_BLEND_GL_ONE |
	   RADEON_DST_BLEND_GL_ONE},
    /* Saturate */
    {1, 1, RADEON_SRC_BLEND_GL_SRC_ALPHA_SATURATE |
	   RADEON_DST_BLEND_GL_ONE},
    {0, 0, 0},
    {0, 0, 0},
    /* DisjointClear */
    {0, 0, RADEON_SRC_BLEND_GL_ZERO |
	   RADEON_DST_BLEND_GL_ZERO},
    /* DisjointSrc */
    {0, 0, RADEON_SRC_BLEND_GL_ONE |
	   RADEON_DST_BLEND_GL_ZERO},
    /* DisjointDst */
    {0, 0, RADEON_SRC_BLEND_GL_ZERO |
	   RADEON_DST_BLEND_GL_ONE},
    /* DisjointOver unsupported */
    {0, 0, 0},
    /* DisjointOverReverse */
    {1, 1, RADEON_SRC_BLEND_GL_SRC_ALPHA_SATURATE |
	   RADEON_DST_BLEND_GL_ONE},
    /* DisjointIn unsupported */
    {0, 0, 0},
    /* DisjointInReverse unsupported */
    {0, 0, 0},
    /* DisjointOut unsupported */
    {1, 1, RADEON_SRC_BLEND_GL_SRC_ALPHA_SATURATE |
	   RADEON_DST_BLEND_GL_ZERO},
    /* DisjointOutReverse unsupported */
    {0, 0, 0},
    /* DisjointAtop unsupported */
    {0, 0, 0},
    /* DisjointAtopReverse unsupported */
    {0, 0, 0},
    /* DisjointXor unsupported */
    {0, 0, 0},
    {0, 0, 0},
    {0, 0, 0},
    {0, 0, 0},
    {0, 0, 0},
    /* ConjointClear */
    {0, 0, RADEON_SRC_BLEND_GL_ZERO |
	   RADEON_DST_BLEND_GL_ZERO},
    /* ConjointSrc */
    {0, 0, RADEON_SRC_BLEND_GL_ONE |
	   RADEON_DST_BLEND_GL_ZERO},
    /* ConjointDst */
    {0, 0, RADEON_SRC_BLEND_GL_ZERO |
	   RADEON_DST_BLEND_GL_ONE},
};
#define RadeonOpMax (sizeof(RadeonBlendOp) / sizeof(RadeonBlendOp[0]))

/* Note on texture formats:
 * TXFORMAT_Y8 expands to (Y,Y,Y,1).  TXFORMAT_I8 expands to (I,I,I,I)
 * The RADEON and R200 TXFORMATS we use are the same on r100/r200.
 */

static CARD32 RADEONTextureFormats[] = {
    PICT_a8r8g8b8,
    PICT_a8,
    PICT_x8r8g8b8,
    PICT_r5g6b5,
    PICT_a1r5g5b5,
    PICT_x1r5g5b5,
    0
};

static CARD32 RADEONDstFormats[] = {
    PICT_a8r8g8b8,
    PICT_x8r8g8b8,
    PICT_r5g6b5,
    PICT_a1r5g5b5,
    PICT_x1r5g5b5,
    0
};

static CARD32
RadeonGetTextureFormat(CARD32 format)
{
    switch (format) {
    case PICT_a8r8g8b8:
	return RADEON_TXFORMAT_ARGB8888 | RADEON_TXFORMAT_ALPHA_IN_MAP;
    case PICT_a8:
	return RADEON_TXFORMAT_I8 | RADEON_TXFORMAT_ALPHA_IN_MAP;
    case PICT_x8r8g8b8:
	return RADEON_TXFORMAT_ARGB8888;
    case PICT_r5g6b5:
	return RADEON_TXFORMAT_RGB565;
    case PICT_a1r5g5b5:
	return RADEON_TXFORMAT_ARGB1555 | RADEON_TXFORMAT_ALPHA_IN_MAP;
    case PICT_x1r5g5b5:
	return RADEON_TXFORMAT_ARGB1555;
    default:
	return 0;
    }
}

static CARD32
RadeonGetColorFormat(CARD32 format)
{
    switch (format) {
    case PICT_a8r8g8b8:
    case PICT_x8r8g8b8:
	return RADEON_COLOR_FORMAT_ARGB8888;
    case PICT_r5g6b5:
	return RADEON_COLOR_FORMAT_RGB565;
    case PICT_a1r5g5b5:
    case PICT_x1r5g5b5:
	return RADEON_COLOR_FORMAT_ARGB1555;
    default:
	return 0;
    }
}

/* Returns a RADEON_RB3D_BLENDCNTL value, or 0 if the operation is not
 * supported
 */
static CARD32
RadeonGetBlendCntl(CARD8 op, CARD32 dstFormat)
{
    CARD32 blend_cntl;

    if (op >= RadeonOpMax || RadeonBlendOp[op].blend_cntl == 0)
	return 0;

    blend_cntl = RadeonBlendOp[op].blend_cntl;
	
    if (RadeonBlendOp[op].dst_alpha && !PICT_FORMAT_A(dstFormat)) {
	CARD32 srcblend = blend_cntl & RADEON_SRC_BLEND_MASK;

	/* If there's no destination alpha channel, we need to wire the blending
	 * to treat the alpha channel as always 1.
	 */
	if (srcblend == RADEON_SRC_BLEND_GL_ONE_MINUS_DST_ALPHA ||
	    srcblend == RADEON_SRC_BLEND_GL_SRC_ALPHA_SATURATE)
	    blend_cntl = (blend_cntl & ~RADEON_SRC_BLEND_MASK) |
			 RADEON_SRC_BLEND_GL_ZERO;
	else if (srcblend == RADEON_SRC_BLEND_GL_DST_ALPHA)
	    blend_cntl = (blend_cntl & ~RADEON_SRC_BLEND_MASK) |
			 RADEON_SRC_BLEND_GL_ONE;
    }

    return blend_cntl;
}

static __inline__ CARD32 F_TO_DW(float val)
{
    union {
	float f;
	CARD32 l;
    } tmp;
    tmp.f = val;
    return tmp.l;
}

/* Compute log base 2 of val. */
static __inline__ int
ATILog2(int val)
{
	int bits;

	for (bits = 0; val != 0; val >>= 1, ++bits)
		;
	return bits - 1;
}

static void RadeonInit3DEngine(ScrnInfoPtr pScrn)
{
    RADEONInfoPtr info = RADEONPTR (pScrn);

#ifdef XF86DRI
    if (info->directRenderingEnabled) {
	RADEONSAREAPrivPtr pSAREAPriv;

	pSAREAPriv = DRIGetSAREAPrivate(pScrn->pScreen);
	pSAREAPriv->ctxOwner = DRIGetContext(pScrn->pScreen);
	RadeonInit3DEngineCP(pScrn);
    } else
#endif
	RadeonInit3DEngineMMIO(pScrn);

    info->RenderInited3D = TRUE;
}

static void
RemoveLinear (FBLinearPtr linear)
{
   RADEONInfoPtr info = (RADEONInfoPtr)(linear->devPrivate.ptr);

   info->RenderTex = NULL; 
}

static void
RenderCallback (ScrnInfoPtr pScrn)
{
    RADEONInfoPtr  info       = RADEONPTR(pScrn);

    if ((currentTime.milliseconds > info->RenderTimeout) && info->RenderTex) {
	xf86FreeOffscreenLinear(info->RenderTex);
	info->RenderTex = NULL;
    }

    if (!info->RenderTex)
	info->RenderCallback = NULL;
}

static Bool
AllocateLinear (
   ScrnInfoPtr pScrn,
   int sizeNeeded
){
    RADEONInfoPtr  info       = RADEONPTR(pScrn);

   info->RenderTimeout = currentTime.milliseconds + 30000;
   info->RenderCallback = RenderCallback;

   if (info->RenderTex) {
	if (info->RenderTex->size >= sizeNeeded)
	   return TRUE;
	else {
	   if (xf86ResizeOffscreenLinear(info->RenderTex, sizeNeeded))
		return TRUE;

	   xf86FreeOffscreenLinear(info->RenderTex);
	   info->RenderTex = NULL;
	}
   }

   info->RenderTex = xf86AllocateOffscreenLinear(pScrn->pScreen, sizeNeeded, 32,
						 NULL, RemoveLinear, info);

   return (info->RenderTex != NULL);
}

#if X_BYTE_ORDER == X_BIG_ENDIAN
static Bool RADEONSetupRenderByteswap(ScrnInfoPtr pScrn, int tex_bytepp)
{
    RADEONInfoPtr info = RADEONPTR(pScrn);
    unsigned char *RADEONMMIO = info->MMIO;

    /* Set up byte swapping for the framebuffer aperture as needed */
    switch (tex_bytepp) {
    case 1:
	OUTREG(RADEON_SURFACE_CNTL, info->ModeReg.surface_cntl &
				    ~(RADEON_NONSURF_AP0_SWP_32BPP
				    | RADEON_NONSURF_AP0_SWP_16BPP));
	break;
    case 2:
	OUTREG(RADEON_SURFACE_CNTL, (info->ModeReg.surface_cntl &
				     ~RADEON_NONSURF_AP0_SWP_32BPP)
				   | RADEON_NONSURF_AP0_SWP_16BPP);
	break;
    case 4:
	OUTREG(RADEON_SURFACE_CNTL, (info->ModeReg.surface_cntl &
				     ~RADEON_NONSURF_AP0_SWP_16BPP)
				   | RADEON_NONSURF_AP0_SWP_32BPP);
	break;
    default:
	xf86DrvMsg(pScrn->scrnIndex, X_ERROR, "%s: Don't know what to do for "
		   "tex_bytepp == %d!\n", __func__, tex_bytepp);
	return FALSE;
    }
}

static void RADEONRestoreByteswap(RADEONInfoPtr info)
{
    unsigned char *RADEONMMIO = info->MMIO;

    OUTREG(RADEON_SURFACE_CNTL, info->ModeReg.surface_cntl);
}
#endif	/* X_BYTE_ORDER == X_BIG_ENDIAN */

#endif	/* RENDER_GENERIC_HELPER */

#if defined(ACCEL_MMIO) && defined(ACCEL_CP)
#error Cannot define both MMIO and CP acceleration!
#endif

#if !defined(UNIXCPP) || defined(ANSICPP)
#define FUNC_NAME_CAT(prefix,suffix) prefix##suffix
#else
#define FUNC_NAME_CAT(prefix,suffix) prefix/**/suffix
#endif

#ifdef ACCEL_MMIO
#define FUNC_NAME(prefix) FUNC_NAME_CAT(prefix,MMIO)
#else
#ifdef ACCEL_CP
#define FUNC_NAME(prefix) FUNC_NAME_CAT(prefix,CP)
#else
#error No accel type defined!
#endif
#endif


static void FUNC_NAME(RadeonInit3DEngine)(ScrnInfoPtr pScrn)
{
    RADEONInfoPtr  info       = RADEONPTR(pScrn);
    ACCEL_PREAMBLE();

    if (info->ChipFamily >= CHIP_FAMILY_R300) {
	/* Unimplemented */
    } else if ((info->ChipFamily == CHIP_FAMILY_RV250) || 
	       (info->ChipFamily == CHIP_FAMILY_RV280) || 
	       (info->ChipFamily == CHIP_FAMILY_RS300) || 
	       (info->ChipFamily == CHIP_FAMILY_R200)) {

	BEGIN_ACCEL(7);
        if (info->ChipFamily == CHIP_FAMILY_RS300) {
            OUT_ACCEL_REG(R200_SE_VAP_CNTL_STATUS, RADEON_TCL_BYPASS);
        } else {
            OUT_ACCEL_REG(R200_SE_VAP_CNTL_STATUS, 0);
        }
	OUT_ACCEL_REG(R200_PP_CNTL_X, 0);
	OUT_ACCEL_REG(R200_PP_TXMULTI_CTL_0, 0);
	OUT_ACCEL_REG(R200_SE_VTX_STATE_CNTL, 0);
	OUT_ACCEL_REG(R200_RE_CNTL, 0x0);
	/* XXX: correct?  Want it to be like RADEON_VTX_ST?_NONPARAMETRIC */
	OUT_ACCEL_REG(R200_SE_VTE_CNTL, R200_VTX_ST_DENORMALIZED);
	OUT_ACCEL_REG(R200_SE_VAP_CNTL, R200_VAP_FORCE_W_TO_ONE |
	    R200_VAP_VF_MAX_VTX_NUM);
	FINISH_ACCEL();
    } else {
	BEGIN_ACCEL(2);
        if ((info->ChipFamily == CHIP_FAMILY_RADEON) ||
            (info->ChipFamily == CHIP_FAMILY_RV200))
            OUT_ACCEL_REG(RADEON_SE_CNTL_STATUS, 0);
        else
            OUT_ACCEL_REG(RADEON_SE_CNTL_STATUS, RADEON_TCL_BYPASS);
	OUT_ACCEL_REG(RADEON_SE_COORD_FMT,
	    RADEON_VTX_XY_PRE_MULT_1_OVER_W0 |
	    RADEON_VTX_ST0_NONPARAMETRIC |
	    RADEON_VTX_ST1_NONPARAMETRIC |
	    RADEON_TEX1_W_ROUTING_USE_W0);
	FINISH_ACCEL();
    }

    BEGIN_ACCEL(3);
    OUT_ACCEL_REG(RADEON_RE_TOP_LEFT, 0);
    OUT_ACCEL_REG(RADEON_RE_WIDTH_HEIGHT, 0x07ff07ff);
    OUT_ACCEL_REG(RADEON_SE_CNTL, RADEON_DIFFUSE_SHADE_GOURAUD |
				  RADEON_BFACE_SOLID | 
				  RADEON_FFACE_SOLID |
				  RADEON_VTX_PIX_CENTER_OGL |
				  RADEON_ROUND_MODE_ROUND |
				  RADEON_ROUND_PREC_4TH_PIX);
    FINISH_ACCEL();
}

static Bool FUNC_NAME(R100SetupTexture)(
	ScrnInfoPtr pScrn,
	CARD32 format,
	CARD8 *src,
	int src_pitch,
	int width,
	int height,
	int flags)
{
    RADEONInfoPtr info = RADEONPTR(pScrn);
    CARD8 *dst;
    CARD32 tex_size = 0, txformat;
    int dst_pitch, offset, size, i, tex_bytepp;
    ACCEL_PREAMBLE();

    if ((width > 2048) || (height > 2048))
	return FALSE;

    txformat = RadeonGetTextureFormat(format);
    tex_bytepp = PICT_FORMAT_BPP(format) >> 3;

#if X_BYTE_ORDER == X_BIG_ENDIAN
    if (!RADEONSetupRenderByteswap(pScrn, tex_bytepp)) {
	xf86DrvMsg(pScrn->scrnIndex, X_ERROR, "%s: RADEONSetupRenderByteswap() "
		   "failed!\n", __func__);
	return FALSE;
    }
#endif

    dst_pitch = (width * tex_bytepp + 31) & ~31;
    size = dst_pitch * height;

    if (!AllocateLinear(pScrn, size))
	return FALSE;

    if (flags & XAA_RENDER_REPEAT) {
	txformat |= ATILog2(width) << RADEON_TXFORMAT_WIDTH_SHIFT;
	txformat |= ATILog2(height) << RADEON_TXFORMAT_HEIGHT_SHIFT;
    } else {
	tex_size = ((height - 1) << 16) | (width - 1);
	txformat |= RADEON_TXFORMAT_NON_POWER2;
    }

    offset = info->RenderTex->offset * pScrn->bitsPerPixel / 8;

    /* Upload texture to card.  Should use ImageWrite to avoid syncing. */
    i = height;
    dst = (CARD8*)(info->FB + offset);

    if (info->accel->NeedToSync)
	info->accel->Sync(pScrn);

    while(i--) {
	memcpy(dst, src, width * tex_bytepp);
	src += src_pitch;
	dst += dst_pitch;
    }

#if X_BYTE_ORDER == X_BIG_ENDIAN
    RADEONRestoreByteswap(info);
#endif

    BEGIN_ACCEL(5);
    OUT_ACCEL_REG(RADEON_PP_TXFORMAT_0, txformat);
    OUT_ACCEL_REG(RADEON_PP_TEX_SIZE_0, tex_size);
    OUT_ACCEL_REG(RADEON_PP_TEX_PITCH_0, dst_pitch - 32);
    OUT_ACCEL_REG(RADEON_PP_TXOFFSET_0, offset + info->fbLocation +
					pScrn->fbOffset);
    OUT_ACCEL_REG(RADEON_PP_TXFILTER_0, RADEON_MAG_FILTER_LINEAR |
					RADEON_MIN_FILTER_LINEAR |
					RADEON_CLAMP_S_WRAP |
					RADEON_CLAMP_T_WRAP);
    FINISH_ACCEL();

    return TRUE;
}

static Bool
FUNC_NAME(R100SetupForCPUToScreenAlphaTexture) (
	ScrnInfoPtr	pScrn,
	int		op,
	CARD16		red,
	CARD16		green,
	CARD16		blue,
	CARD16		alpha,
	CARD32		maskFormat,
	CARD32		dstFormat,
	CARD8		*alphaPtr,
	int		alphaPitch,
	int		width,
	int		height,
	int		flags
) 
{
    RADEONInfoPtr info = RADEONPTR(pScrn);
    CARD32 colorformat, srccolor, blend_cntl;
    ACCEL_PREAMBLE();

    blend_cntl = RadeonGetBlendCntl(op, dstFormat);
    if (blend_cntl == 0)
	return FALSE;

    if (!info->RenderInited3D)
	RadeonInit3DEngine(pScrn);

    if (!FUNC_NAME(R100SetupTexture)(pScrn, maskFormat, alphaPtr, alphaPitch,
				     width, height, flags))
	return FALSE;

    colorformat = RadeonGetColorFormat(dstFormat);

    srccolor = ((alpha & 0xff00) << 16) | ((red & 0xff00) << 8) | (blue >> 8) |
	(green & 0xff00);

    BEGIN_ACCEL(8);
    OUT_ACCEL_REG(RADEON_RB3D_CNTL, colorformat | RADEON_ALPHA_BLEND_ENABLE);
    OUT_ACCEL_REG(RADEON_RB3D_COLORPITCH, pScrn->displayWidth);
    OUT_ACCEL_REG(RADEON_PP_CNTL, RADEON_TEX_0_ENABLE |
				  RADEON_TEX_BLEND_0_ENABLE);
    OUT_ACCEL_REG(RADEON_PP_TFACTOR_0, srccolor);
    OUT_ACCEL_REG(RADEON_PP_TXCBLEND_0, RADEON_COLOR_ARG_A_TFACTOR_COLOR |
					RADEON_COLOR_ARG_B_T0_ALPHA);
    OUT_ACCEL_REG(RADEON_PP_TXABLEND_0, RADEON_ALPHA_ARG_A_TFACTOR_ALPHA |
					RADEON_ALPHA_ARG_B_T0_ALPHA);
    OUT_ACCEL_REG(RADEON_SE_VTX_FMT, RADEON_SE_VTX_FMT_XY |
				     RADEON_SE_VTX_FMT_ST0);
    OUT_ACCEL_REG(RADEON_RB3D_BLENDCNTL, blend_cntl);
    FINISH_ACCEL();

    return TRUE;
}


static Bool
FUNC_NAME(R100SetupForCPUToScreenTexture) (
	ScrnInfoPtr	pScrn,
	int		op,
	CARD32		srcFormat,
	CARD32		dstFormat,
	CARD8		*texPtr,
	int		texPitch,
	int		width,
	int		height,
	int		flags
)
{
    RADEONInfoPtr info = RADEONPTR(pScrn);
    CARD32 colorformat, blend_cntl;
    ACCEL_PREAMBLE();

    blend_cntl = RadeonGetBlendCntl(op, dstFormat);
    if (blend_cntl == 0)
	return FALSE;
    
    if (!info->RenderInited3D)
	RadeonInit3DEngine(pScrn);

    if (!FUNC_NAME(R100SetupTexture)(pScrn, srcFormat, texPtr, texPitch, width,
				     height, flags))
	return FALSE;

    colorformat = RadeonGetColorFormat(dstFormat);
    
    BEGIN_ACCEL(7);
    OUT_ACCEL_REG(RADEON_RB3D_CNTL, colorformat | RADEON_ALPHA_BLEND_ENABLE);
    OUT_ACCEL_REG(RADEON_RB3D_COLORPITCH, pScrn->displayWidth);
    OUT_ACCEL_REG(RADEON_PP_CNTL, RADEON_TEX_0_ENABLE |
				  RADEON_TEX_BLEND_0_ENABLE);
    if (srcFormat != PICT_a8)
	OUT_ACCEL_REG(RADEON_PP_TXCBLEND_0, RADEON_COLOR_ARG_C_T0_COLOR);
    else
	OUT_ACCEL_REG(RADEON_PP_TXCBLEND_0, RADEON_COLOR_ARG_C_ZERO);
    OUT_ACCEL_REG(RADEON_PP_TXABLEND_0, RADEON_ALPHA_ARG_C_T0_ALPHA);
    OUT_ACCEL_REG(RADEON_SE_VTX_FMT, RADEON_SE_VTX_FMT_XY |
				     RADEON_SE_VTX_FMT_ST0);
    OUT_ACCEL_REG(RADEON_RB3D_BLENDCNTL, blend_cntl);
    FINISH_ACCEL();

    return TRUE;
}


static void
FUNC_NAME(R100SubsequentCPUToScreenTexture) (
	ScrnInfoPtr	pScrn,
	int		dstx,
	int		dsty,
	int		srcx,
	int		srcy,
	int		width,
	int		height
)
{
    RADEONInfoPtr  info       = RADEONPTR(pScrn);
    int byteshift;
    CARD32 fboffset;
    float l, t, r, b, fl, fr, ft, fb;

    ACCEL_PREAMBLE();

    /* Note: we can't simply set up the 3D surface at the same location as the
     * front buffer, because the 2048x2048 limit on coordinates may be smaller
     * than the (MergedFB) screen.
     */ 
    byteshift = (pScrn->bitsPerPixel >> 4);
    fboffset = (info->fbLocation + pScrn->fbOffset +
		((pScrn->displayWidth * dsty + dstx) << byteshift)) & ~15;
    l = ((dstx << byteshift) % 16) >> byteshift;
    t = 0.0;
    r = width + l;
    b = height;
    fl = srcx;
    fr = srcx + width;
    ft = srcy;
    fb = srcy + height;

#ifdef ACCEL_CP
    BEGIN_RING(23);

    OUT_ACCEL_REG(RADEON_RB3D_COLOROFFSET, fboffset);

    OUT_RING(CP_PACKET3(RADEON_CP_PACKET3_3D_DRAW_IMMD, 17));
    /* RADEON_SE_VTX_FMT */
    OUT_RING(RADEON_CP_VC_FRMT_XY |
	     RADEON_CP_VC_FRMT_ST0);
    /* SE_VF_CNTL */
    OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_FAN |
	     RADEON_CP_VC_CNTL_PRIM_WALK_RING |
	     RADEON_CP_VC_CNTL_MAOS_ENABLE |
	     RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE |
	     (4 << RADEON_CP_VC_CNTL_NUM_SHIFT));

    OUT_RING(F_TO_DW(l));
    OUT_RING(F_TO_DW(t));
    OUT_RING(F_TO_DW(fl));
    OUT_RING(F_TO_DW(ft));

    OUT_RING(F_TO_DW(r));
    OUT_RING(F_TO_DW(t));
    OUT_RING(F_TO_DW(fr));
    OUT_RING(F_TO_DW(ft));

    OUT_RING(F_TO_DW(r));
    OUT_RING(F_TO_DW(b));
    OUT_RING(F_TO_DW(fr));
    OUT_RING(F_TO_DW(fb));

    OUT_RING(F_TO_DW(l));
    OUT_RING(F_TO_DW(b));
    OUT_RING(F_TO_DW(fl));
    OUT_RING(F_TO_DW(fb));

    OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);

    ADVANCE_RING();
#else
    BEGIN_ACCEL(19);
    
    OUT_ACCEL_REG(RADEON_RB3D_COLOROFFSET, fboffset);

    OUT_ACCEL_REG(RADEON_SE_VF_CNTL, RADEON_VF_PRIM_TYPE_TRIANGLE_FAN |
				     RADEON_VF_PRIM_WALK_DATA |
				     RADEON_VF_RADEON_MODE |
				     (4 << RADEON_VF_NUM_VERTICES_SHIFT));
	
    OUT_ACCEL_REG(RADEON_SE_PORT_DATA0, F_TO_DW(l));
    OUT_ACCEL_REG(RADEON_SE_PORT_DATA0, F_TO_DW(t));
    OUT_ACCEL_REG(RADEON_SE_PORT_DATA0, F_TO_DW(fl));
    OUT_ACCEL_REG(RADEON_SE_PORT_DATA0, F_TO_DW(ft));

    OUT_ACCEL_REG(RADEON_SE_PORT_DATA0, F_TO_DW(r));
    OUT_ACCEL_REG(RADEON_SE_PORT_DATA0, F_TO_DW(t));
    OUT_ACCEL_REG(RADEON_SE_PORT_DATA0, F_TO_DW(fr));
    OUT_ACCEL_REG(RADEON_SE_PORT_DATA0, F_TO_DW(ft));

    OUT_ACCEL_REG(RADEON_SE_PORT_DATA0, F_TO_DW(r));
    OUT_ACCEL_REG(RADEON_SE_PORT_DATA0, F_TO_DW(b));
    OUT_ACCEL_REG(RADEON_SE_PORT_DATA0, F_TO_DW(fr));
    OUT_ACCEL_REG(RADEON_SE_PORT_DATA0, F_TO_DW(fb));

    OUT_ACCEL_REG(RADEON_SE_PORT_DATA0, F_TO_DW(l));
    OUT_ACCEL_REG(RADEON_SE_PORT_DATA0, F_TO_DW(b));
    OUT_ACCEL_REG(RADEON_SE_PORT_DATA0, F_TO_DW(fl));
    OUT_ACCEL_REG(RADEON_SE_PORT_DATA0, F_TO_DW(fb));

    OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
    FINISH_ACCEL();
#endif

}

static Bool FUNC_NAME(R200SetupTexture)(
	ScrnInfoPtr pScrn,
	CARD32 format,
	CARD8 *src,
	int src_pitch,
	int width,
	int height,
	int flags)
{
    RADEONInfoPtr info = RADEONPTR(pScrn);
    CARD8 *dst;
    CARD32 tex_size = 0, txformat;
    int dst_pitch, offset, size, i, tex_bytepp;
    ACCEL_PREAMBLE();

    if ((width > 2048) || (height > 2048))
	return FALSE;

    txformat = RadeonGetTextureFormat(format);
    tex_bytepp = PICT_FORMAT_BPP(format) >> 3;

#if X_BYTE_ORDER == X_BIG_ENDIAN
    if (!RADEONSetupRenderByteswap(pScrn, tex_bytepp)) {
	xf86DrvMsg(pScrn->scrnIndex, X_ERROR, "%s: RADEONSetupRenderByteswap() "
		   "failed!\n", __func__);
	return FALSE;
    }
#endif

    dst_pitch = (width * tex_bytepp + 31) & ~31;
    size = dst_pitch * height;

    if (!AllocateLinear(pScrn, size))
	return FALSE;

    if (flags & XAA_RENDER_REPEAT) {
	txformat |= ATILog2(width) << R200_TXFORMAT_WIDTH_SHIFT;
	txformat |= ATILog2(height) << R200_TXFORMAT_HEIGHT_SHIFT;
    } else {
	tex_size = ((height - 1) << 16) | (width - 1);
	txformat |= RADEON_TXFORMAT_NON_POWER2;
    }

    offset = info->RenderTex->offset * pScrn->bitsPerPixel / 8;

    /* Upload texture to card.  Should use ImageWrite to avoid syncing. */
    i = height;
    dst = (CARD8*)(info->FB + offset);
    if (info->accel->NeedToSync)
	info->accel->Sync(pScrn);

    while(i--) {
	memcpy(dst, src, width * tex_bytepp);
	src += src_pitch;
	dst += dst_pitch;
    }

#if X_BYTE_ORDER == X_BIG_ENDIAN
    RADEONRestoreByteswap(info);
#endif

    BEGIN_ACCEL(6);
    OUT_ACCEL_REG(R200_PP_TXFORMAT_0, txformat);
    OUT_ACCEL_REG(R200_PP_TXFORMAT_X_0, 0);
    OUT_ACCEL_REG(R200_PP_TXSIZE_0, tex_size);
    OUT_ACCEL_REG(R200_PP_TXPITCH_0, dst_pitch - 32);
    OUT_ACCEL_REG(R200_PP_TXOFFSET_0, offset + info->fbLocation +
				      pScrn->fbOffset);
    OUT_ACCEL_REG(R200_PP_TXFILTER_0, R200_MAG_FILTER_NEAREST |
				      R200_MIN_FILTER_NEAREST |
				      R200_CLAMP_S_WRAP |
				      R200_CLAMP_T_WRAP);
    FINISH_ACCEL();

    return TRUE;
}

static Bool
FUNC_NAME(R200SetupForCPUToScreenAlphaTexture) (
	ScrnInfoPtr	pScrn,
	int		op,
	CARD16		red,
	CARD16		green,
	CARD16		blue,
	CARD16		alpha,
	CARD32		maskFormat,
	CARD32		dstFormat,
	CARD8		*alphaPtr,
	int		alphaPitch,
	int		width,
	int		height,
	int		flags
) 
{
    RADEONInfoPtr  info = RADEONPTR(pScrn);
    CARD32 colorformat, srccolor, blend_cntl;
    ACCEL_PREAMBLE();

    blend_cntl = RadeonGetBlendCntl(op, dstFormat);
    if (blend_cntl == 0)
	return FALSE;

    if (!info->RenderInited3D)
	RadeonInit3DEngine(pScrn);

    if (!FUNC_NAME(R200SetupTexture)(pScrn, maskFormat, alphaPtr, alphaPitch,
				     width, height, flags))
	return FALSE;

    colorformat = RadeonGetColorFormat(dstFormat);

    srccolor = ((alpha & 0xff00) << 16) | ((red & 0xff00) << 8) | (blue >> 8) |
	(green & 0xff00);

    BEGIN_ACCEL(11);
    OUT_ACCEL_REG(RADEON_RB3D_CNTL, colorformat | RADEON_ALPHA_BLEND_ENABLE);
    OUT_ACCEL_REG(RADEON_RB3D_COLORPITCH, pScrn->displayWidth);
    OUT_ACCEL_REG(RADEON_PP_CNTL, RADEON_TEX_0_ENABLE |
				  RADEON_TEX_BLEND_0_ENABLE);
    OUT_ACCEL_REG(R200_PP_TFACTOR_0, srccolor);
    OUT_ACCEL_REG(R200_PP_TXCBLEND_0, R200_TXC_ARG_A_TFACTOR_COLOR |
				      R200_TXC_ARG_B_R0_ALPHA);
    OUT_ACCEL_REG(R200_PP_TXCBLEND2_0, R200_TXC_OUTPUT_REG_R0);
    OUT_ACCEL_REG(R200_PP_TXABLEND_0, R200_TXA_ARG_A_TFACTOR_ALPHA |
				      R200_TXA_ARG_B_R0_ALPHA);
    OUT_ACCEL_REG(R200_PP_TXABLEND2_0, R200_TXA_OUTPUT_REG_R0);
    OUT_ACCEL_REG(R200_SE_VTX_FMT_0, 0);
    OUT_ACCEL_REG(R200_SE_VTX_FMT_1, (2 << R200_VTX_TEX0_COMP_CNT_SHIFT));
    OUT_ACCEL_REG(RADEON_RB3D_BLENDCNTL, blend_cntl);
    FINISH_ACCEL();

    return TRUE;
}

static Bool
FUNC_NAME(R200SetupForCPUToScreenTexture) (
	ScrnInfoPtr	pScrn,
	int		op,
	CARD32		srcFormat,
	CARD32		dstFormat,
	CARD8		*texPtr,
	int		texPitch,
	int		width,
	int		height,
	int		flags
)
{
    RADEONInfoPtr  info       = RADEONPTR(pScrn);
    CARD32 colorformat, blend_cntl;
    ACCEL_PREAMBLE();

    blend_cntl = RadeonGetBlendCntl(op, dstFormat);
    if (blend_cntl == 0)
	return FALSE;

    if (!info->RenderInited3D)
	RadeonInit3DEngine(pScrn);

    if (!FUNC_NAME(R200SetupTexture)(pScrn, srcFormat, texPtr, texPitch, width,
				     height, flags))
	return FALSE;

    colorformat = RadeonGetColorFormat(dstFormat);

    BEGIN_ACCEL(10);
    OUT_ACCEL_REG(RADEON_RB3D_CNTL, colorformat | RADEON_ALPHA_BLEND_ENABLE);
    OUT_ACCEL_REG(RADEON_RB3D_COLORPITCH, pScrn->displayWidth);
    OUT_ACCEL_REG(RADEON_PP_CNTL, RADEON_TEX_0_ENABLE |
				  RADEON_TEX_BLEND_0_ENABLE);
    if (srcFormat != PICT_a8)
	OUT_ACCEL_REG(R200_PP_TXCBLEND_0, R200_TXC_ARG_C_R0_COLOR);
    else
	OUT_ACCEL_REG(R200_PP_TXCBLEND_0, R200_TXC_ARG_C_ZERO);
    OUT_ACCEL_REG(R200_PP_TXCBLEND2_0, R200_TXC_OUTPUT_REG_R0);
    OUT_ACCEL_REG(R200_PP_TXABLEND_0, R200_TXA_ARG_C_R0_ALPHA);
    OUT_ACCEL_REG(R200_PP_TXABLEND2_0, R200_TXA_OUTPUT_REG_R0);
    OUT_ACCEL_REG(R200_SE_VTX_FMT_0, 0);
    OUT_ACCEL_REG(R200_SE_VTX_FMT_1, (2 << R200_VTX_TEX0_COMP_CNT_SHIFT));
    OUT_ACCEL_REG(RADEON_RB3D_BLENDCNTL, blend_cntl);
    FINISH_ACCEL();

    return TRUE;
}

static void
FUNC_NAME(R200SubsequentCPUToScreenTexture) (
	ScrnInfoPtr	pScrn,
	int		dstx,
	int		dsty,
	int		srcx,
	int		srcy,
	int		width,
	int		height
)
{
    RADEONInfoPtr  info       = RADEONPTR(pScrn);
    int byteshift;
    CARD32 fboffset;
    float l, t, r, b, fl, fr, ft, fb;
    ACCEL_PREAMBLE();

    /* Note: we can't simply set up the 3D surface at the same location as the
     * front buffer, because the 2048x2048 limit on coordinates may be smaller
     * than the (MergedFB) screen.
     */ 
    byteshift = (pScrn->bitsPerPixel >> 4);
    fboffset = (info->fbLocation + pScrn->fbOffset + ((pScrn->displayWidth *
	dsty + dstx) << byteshift)) & ~15;
    l = ((dstx << byteshift) % 16) >> byteshift;
    t = 0.0;
    r = width + l;
    b = height;
    fl = srcx;
    fr = srcx + width;
    ft = srcy;
    fb = srcy + height;

#ifdef ACCEL_CP
    BEGIN_RING(24);

    OUT_ACCEL_REG(RADEON_RB3D_COLOROFFSET, fboffset);

    OUT_RING(CP_PACKET3(R200_CP_PACKET3_3D_DRAW_IMMD_2, 16));
    /* RADEON_SE_VF_CNTL */
    OUT_RING(RADEON_CP_VC_CNTL_PRIM_TYPE_TRI_FAN |
	     RADEON_CP_VC_CNTL_PRIM_WALK_RING |
	     (4 << RADEON_CP_VC_CNTL_NUM_SHIFT));

    OUT_RING(F_TO_DW(l));
    OUT_RING(F_TO_DW(t));
    OUT_RING(F_TO_DW(fl));
    OUT_RING(F_TO_DW(ft));

    OUT_RING(F_TO_DW(r));
    OUT_RING(F_TO_DW(t));
    OUT_RING(F_TO_DW(fr));
    OUT_RING(F_TO_DW(ft));

    OUT_RING(F_TO_DW(r));
    OUT_RING(F_TO_DW(b));
    OUT_RING(F_TO_DW(fr));
    OUT_RING(F_TO_DW(fb));

    OUT_RING(F_TO_DW(l));
    OUT_RING(F_TO_DW(b));
    OUT_RING(F_TO_DW(fl));
    OUT_RING(F_TO_DW(fb));

    OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);

    ADVANCE_RING();
#else
    BEGIN_ACCEL(19);
    
    /* Note: we can't simply setup 3D surface at the same location as the front buffer,
       some apps may draw offscreen pictures out of the limitation of radeon 3D surface.
    */ 
    OUT_ACCEL_REG(RADEON_RB3D_COLOROFFSET, fboffset);

    OUT_ACCEL_REG(RADEON_SE_VF_CNTL, (RADEON_VF_PRIM_TYPE_QUAD_LIST |
				      RADEON_VF_PRIM_WALK_DATA |
				      4 << RADEON_VF_NUM_VERTICES_SHIFT));
	
    OUT_ACCEL_REG(RADEON_SE_PORT_DATA0, F_TO_DW(l));
    OUT_ACCEL_REG(RADEON_SE_PORT_DATA0, F_TO_DW(t));
    OUT_ACCEL_REG(RADEON_SE_PORT_DATA0, F_TO_DW(fl));
    OUT_ACCEL_REG(RADEON_SE_PORT_DATA0, F_TO_DW(ft));

    OUT_ACCEL_REG(RADEON_SE_PORT_DATA0, F_TO_DW(r));
    OUT_ACCEL_REG(RADEON_SE_PORT_DATA0, F_TO_DW(t));
    OUT_ACCEL_REG(RADEON_SE_PORT_DATA0, F_TO_DW(fr));
    OUT_ACCEL_REG(RADEON_SE_PORT_DATA0, F_TO_DW(ft));

    OUT_ACCEL_REG(RADEON_SE_PORT_DATA0, F_TO_DW(r));
    OUT_ACCEL_REG(RADEON_SE_PORT_DATA0, F_TO_DW(b));
    OUT_ACCEL_REG(RADEON_SE_PORT_DATA0, F_TO_DW(fr));
    OUT_ACCEL_REG(RADEON_SE_PORT_DATA0, F_TO_DW(fb));

    OUT_ACCEL_REG(RADEON_SE_PORT_DATA0, F_TO_DW(l));
    OUT_ACCEL_REG(RADEON_SE_PORT_DATA0, F_TO_DW(b));
    OUT_ACCEL_REG(RADEON_SE_PORT_DATA0, F_TO_DW(fl));
    OUT_ACCEL_REG(RADEON_SE_PORT_DATA0, F_TO_DW(fb));

    OUT_ACCEL_REG(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);

    FINISH_ACCEL();
#endif
}

#undef FUNC_NAME

