Skip to content

Commit

Permalink
Use NDMA to clear RAM and clear DSi registers
Browse files Browse the repository at this point in the history
RocketRobz committed Feb 27, 2024
1 parent 5a69aba commit d030ddc
Showing 5 changed files with 426 additions and 75 deletions.
70 changes: 0 additions & 70 deletions bootloader/source/arm7clear.s

This file was deleted.

29 changes: 24 additions & 5 deletions bootloader/source/boot.c
Original file line number Diff line number Diff line change
@@ -47,13 +47,13 @@ Helpful information:
#define ARM7
#include <nds/arm7/audio.h>
#include <nds/arm7/sdmmc.h>
#include "dmaTwl.h"
#include "tonccpy.h"
#include "fat.h"
#include "dldi_patcher.h"
#include "card.h"
#include "boot.h"

void arm7clearRAM();

//+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
// Important things
#define TEMP_MEM 0x02FFD000
@@ -159,6 +159,15 @@ void passArgs_ARM7 (void) {



/*-------------------------------------------------------------------------
memset_addrs_arm7
Zero-fills the address range [start, end).
When either dsiMode is set or bit 16 of REG_SCFG_EXT is set, the fill is
done by NDMA channel 0 (blocking until the channel has finished);
otherwise it falls back to a CPU fill with toncset.
NOTE(review): bit 16 of REG_SCFG_EXT is presumably the "NDMA accessible"
flag - confirm against the SCFG documentation.
start: first address to clear
end:   address one past the last byte to clear
-------------------------------------------------------------------------*/
void memset_addrs_arm7(u32 start, u32 end)
{
    if (dsiMode || (REG_SCFG_EXT & BIT(16))) {
        // NDMA path: fill value 0, length given in bytes.
        dma_twlFill32(0, 0, (u32*)start, ((int)end - (int)start));
    } else {
        // No NDMA available: clear with the CPU instead.
        toncset((u32*)start, 0, ((int)end - (int)start));
    }
}

/*-------------------------------------------------------------------------
resetMemory_ARM7
Clears all of the NDS's RAM that is visible to the ARM7
@@ -182,6 +191,13 @@ void resetMemory_ARM7 (void)
}

REG_SOUNDCNT = 0;
REG_SNDCAP0CNT = 0;
REG_SNDCAP1CNT = 0;

REG_SNDCAP0DAD = 0;
REG_SNDCAP0LEN = 0;
REG_SNDCAP1DAD = 0;
REG_SNDCAP1LEN = 0;

//clear out ARM7 DMA channels and timers
for (i=0; i<4; i++) {
@@ -192,12 +208,15 @@ void resetMemory_ARM7 (void)
TIMER_DATA(i) = 0;
}

arm7clearRAM();
memset_addrs_arm7(0x03800000 - 0x8000, 0x03800000 + (dsiMode ? 0xC000 : 0x10000)); // clear exclusive IWRAM
memset_addrs_arm7(0x02004000, (dsiMode ? 0x03000000 : 0x02400000) - 0xC000); // clear part of EWRAM - except before bootstub

REG_IE = 0;
REG_IF = ~0;
(*(vu32*)(0x04000000-4)) = 0; //IRQ_HANDLER ARM7 version
(*(vu32*)(0x04000000-8)) = ~0; //VBLANK_INTR_WAIT_FLAGS, ARM7 version
REG_AUXIE = 0;
REG_AUXIF = ~0;
*(vu32*)0x0380FFFC = 0; // IRQ_HANDLER ARM7 version
*(vu32*)0x0380FFF8 = 0; // VBLANK_INTR_WAIT_FLAGS, ARM7 version
REG_POWERCNT = 1; //turn off power to stuff

// Get settings location
223 changes: 223 additions & 0 deletions bootloader/source/dmaTwl.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
#pragma once

/// Settings for one NDMA channel, consumed by dma_twlSetParams().
/// dma_twlSetParams() stores the fields, in declaration order, into the
/// channel's seven consecutive registers (SAD, DAD, TCNT, WCNT, BCNT,
/// FDATA, CNT).
typedef struct
{
const void* src; // Source address; not used in fill mode
void* dst; // Destination address
u32 totalWordCount; // For auto-start mode without infinite repeat
u32 wordCount; // Number of words to transfer per start trigger
u32 blockInterval; // Sets prescaler and cycles of delay between physical blocks
u32 fillData; // For fill mode
u32 control; // Written to the channel's CNT register, the same register the helpers load with NDMACNT_* flags
} dma_twl_config_t;

// Register written by dma_twlSetFixedArbitration() and
// dma_twlSetRoundRobinArbitration() to select the NDMA arbitration scheme.
#define REG_NDMAGCNT (*(vu32*)0x04004100)

// Yield-cycle selector for round-robin arbitration; the field starts at bit 16.
// Going by the macro names, field value 0 selects 0 cycles and field value
// k (1..15) selects 2^(k-1) cycles. One of these is OR-ed with
// NDMAGCNT_ARBITRATION_ROUND_ROBIN by dma_twlSetRoundRobinArbitration().
#define NDMAGCNT_YIELD_CYCLES_0 (0 << 16)
#define NDMAGCNT_YIELD_CYCLES_1 (1 << 16)
#define NDMAGCNT_YIELD_CYCLES_2 (2 << 16)
#define NDMAGCNT_YIELD_CYCLES_4 (3 << 16)
#define NDMAGCNT_YIELD_CYCLES_8 (4 << 16)
#define NDMAGCNT_YIELD_CYCLES_16 (5 << 16)
#define NDMAGCNT_YIELD_CYCLES_32 (6 << 16)
#define NDMAGCNT_YIELD_CYCLES_64 (7 << 16)
#define NDMAGCNT_YIELD_CYCLES_128 (8 << 16)
#define NDMAGCNT_YIELD_CYCLES_256 (9 << 16)
#define NDMAGCNT_YIELD_CYCLES_512 (10 << 16)
#define NDMAGCNT_YIELD_CYCLES_1024 (11 << 16)
#define NDMAGCNT_YIELD_CYCLES_2048 (12 << 16)
#define NDMAGCNT_YIELD_CYCLES_4096 (13 << 16)
#define NDMAGCNT_YIELD_CYCLES_8192 (14 << 16)
#define NDMAGCNT_YIELD_CYCLES_16384 (15 << 16)

// Arbitration scheme selector, bit 31 of REG_NDMAGCNT.
// The constants are unsigned: shifting a signed 1 into bit 31 overflows int
// (undefined behaviour in ISO C) and in practice yields a negative value
// that sign-extends when widened.
#define NDMAGCNT_ARBITRATION_FIXED (0u << 31)
#define NDMAGCNT_ARBITRATION_ROUND_ROBIN (1u << 31)

// Per-channel NDMA registers for channels 0-3.
// Each channel owns seven consecutive 32-bit registers, so a given register
// of channel n lies 7 words (0x1C bytes) after the same register of channel
// n-1; the inline helpers in this header rely on that stride and index from
// the channel-0 macros.
// Roles, as used by dma_twlSetParams() and the copy/fill helpers:
//   SAD   source address
//   DAD   destination address
//   TCNT  total word count
//   WCNT  word count per start trigger
//   BCNT  block interval / prescaler
//   FDATA fill data
//   CNT   control
#define REG_NDMA0SAD (*(vu32*)0x04004104)
#define REG_NDMA0DAD (*(vu32*)0x04004108)
#define REG_NDMA0TCNT (*(vu32*)0x0400410C)
#define REG_NDMA0WCNT (*(vu32*)0x04004110)
#define REG_NDMA0BCNT (*(vu32*)0x04004114)
#define REG_NDMA0FDATA (*(vu32*)0x04004118)
#define REG_NDMA0CNT (*(vu32*)0x0400411C)

#define REG_NDMA1SAD (*(vu32*)0x04004120)
#define REG_NDMA1DAD (*(vu32*)0x04004124)
#define REG_NDMA1TCNT (*(vu32*)0x04004128)
#define REG_NDMA1WCNT (*(vu32*)0x0400412C)
#define REG_NDMA1BCNT (*(vu32*)0x04004130)
#define REG_NDMA1FDATA (*(vu32*)0x04004134)
#define REG_NDMA1CNT (*(vu32*)0x04004138)

#define REG_NDMA2SAD (*(vu32*)0x0400413C)
#define REG_NDMA2DAD (*(vu32*)0x04004140)
#define REG_NDMA2TCNT (*(vu32*)0x04004144)
#define REG_NDMA2WCNT (*(vu32*)0x04004148)
#define REG_NDMA2BCNT (*(vu32*)0x0400414C)
#define REG_NDMA2FDATA (*(vu32*)0x04004150)
#define REG_NDMA2CNT (*(vu32*)0x04004154)

#define REG_NDMA3SAD (*(vu32*)0x04004158)
#define REG_NDMA3DAD (*(vu32*)0x0400415C)
#define REG_NDMA3TCNT (*(vu32*)0x04004160)
#define REG_NDMA3WCNT (*(vu32*)0x04004164)
#define REG_NDMA3BCNT (*(vu32*)0x04004168)
#define REG_NDMA3FDATA (*(vu32*)0x0400416C)
#define REG_NDMA3CNT (*(vu32*)0x04004170)

// Building blocks for a BCNT (block interval) value: the interval count is
// used as-is in the low bits and the prescaler selector starts at bit 16.
// NDMABCNT_INTERVAL() neither masks nor range-checks its argument.
// The copy/fill helpers in this header use PRESCALER_1 with an interval of 0.
#define NDMABCNT_INTERVAL(x) (x)
#define NDMABCNT_PRESCALER_1 (0 << 16)
#define NDMABCNT_PRESCALER_4 (1 << 16)
#define NDMABCNT_PRESCALER_16 (2 << 16)
#define NDMABCNT_PRESCALER_64 (3 << 16)

// Fields of the per-channel control (CNT) value. Values from different
// groups are OR-ed together, as done in dma_twlCopy32Async() and
// dma_twlFill32Async().

// Destination address mode; field starts at bit 10.
#define NDMACNT_DST_MODE_INCREMENT (0 << 10)
#define NDMACNT_DST_MODE_DECREMENT (1 << 10)
#define NDMACNT_DST_MODE_FIXED (2 << 10)

// Destination reload flag, bit 12.
#define NDMACNT_DST_RELOAD (1 << 12)

// Source address mode; field starts at bit 13. FILLDATA is the mode used by
// dma_twlFill32Async(), which loads the fill value into FDATA and leaves SAD unset.
#define NDMACNT_SRC_MODE_INCREMENT (0 << 13)
#define NDMACNT_SRC_MODE_DECREMENT (1 << 13)
#define NDMACNT_SRC_MODE_FIXED (2 << 13)
#define NDMACNT_SRC_MODE_FILLDATA (3 << 13)

// Source reload flag, bit 15.
#define NDMACNT_SRC_RELOAD (1 << 15)

// Physical block size selector; field starts at bit 16.
// Going by the macro names, field value k selects 2^k words.
#define NDMACNT_PHYSICAL_COUNT_1 (0 << 16)
#define NDMACNT_PHYSICAL_COUNT_2 (1 << 16)
#define NDMACNT_PHYSICAL_COUNT_4 (2 << 16)
#define NDMACNT_PHYSICAL_COUNT_8 (3 << 16)
#define NDMACNT_PHYSICAL_COUNT_16 (4 << 16)
#define NDMACNT_PHYSICAL_COUNT_32 (5 << 16)
#define NDMACNT_PHYSICAL_COUNT_64 (6 << 16)
#define NDMACNT_PHYSICAL_COUNT_128 (7 << 16)
#define NDMACNT_PHYSICAL_COUNT_256 (8 << 16)
#define NDMACNT_PHYSICAL_COUNT_512 (9 << 16)
#define NDMACNT_PHYSICAL_COUNT_1024 (10 << 16)
#define NDMACNT_PHYSICAL_COUNT_2048 (11 << 16)
#define NDMACNT_PHYSICAL_COUNT_4096 (12 << 16)
#define NDMACNT_PHYSICAL_COUNT_8192 (13 << 16)
#define NDMACNT_PHYSICAL_COUNT_16384 (14 << 16)
#define NDMACNT_PHYSICAL_COUNT_32768 (15 << 16)

// Start mode selector; field starts at bit 24. Presumably each name is the
// hardware event that triggers a transfer - confirm against the NDMA docs.
// Values 0-6 are defined for both CPUs.
#define NDMACNT_MODE_TIMER_0 (0 << 24)
#define NDMACNT_MODE_TIMER_1 (1 << 24)
#define NDMACNT_MODE_TIMER_2 (2 << 24)
#define NDMACNT_MODE_TIMER_3 (3 << 24)
#define NDMACNT_MODE_DS_SLOTA_ROM_XFER (4 << 24)
#define NDMACNT_MODE_DS_SLOTB_ROM_XFER (5 << 24)
#define NDMACNT_MODE_VBLANK (6 << 24)

// Values 7 and up mean different things on each CPU, so they are only
// defined when the matching LIBTWL_ARM9 / LIBTWL_ARM7 macro is set.
// NOTE(review): boot.c defines ARM7, not LIBTWL_ARM7, before including this
// header, so neither group appears to be visible there unless the build
// system defines the macro - confirm.
#ifdef LIBTWL_ARM9

#define NDMACNT_MODE_HBLANK (7 << 24)
#define NDMACNT_MODE_DISPLAY (8 << 24)
#define NDMACNT_MODE_MMEM_DISP_FIFO (9 << 24)
#define NDMACNT_MODE_GX_FIFO (10 << 24)
#define NDMACNT_MODE_CAMERA (11 << 24)

#endif

#ifdef LIBTWL_ARM7

#define NDMACNT_MODE_WIFI (7 << 24)
#define NDMACNT_MODE_SDMMC (8 << 24)
#define NDMACNT_MODE_SDIO (9 << 24)
#define NDMACNT_MODE_AES_IN (10 << 24)
#define NDMACNT_MODE_AES_OUT (11 << 24)
#define NDMACNT_MODE_MIC (12 << 24)

#endif

// Top control (CNT) flags, bits 28-31.
// IMMEDIATE and ENABLE are the two set by the copy/fill helpers in this
// header; dma_twlWait() polls ENABLE until it reads back as zero.
#define NDMACNT_MODE_IMMEDIATE (1 << 28)
#define NDMACNT_REPEAT_INFINITELY (1 << 29)
#define NDMACNT_IRQ (1 << 30)
// Unsigned on purpose: (1 << 31) overflows a signed int, which is undefined
// behaviour in ISO C and in practice gives a negative, sign-extending value.
#define NDMACNT_ENABLE (1u << 31)

#ifdef __cplusplus
extern "C" {
#endif

/// @brief Configures twl ndma to use fixed arbitration.
/// In this mode ndma0 has the highest and ndma3 the lowest priority,
/// similar to the nitro dma channels. Note that ndma0 has a lower
/// priority than nitro dma channel 3. When ndma channels are active
/// the dsp and cpu can not access the bus.
static inline void dma_twlSetFixedArbitration(void)
{
REG_NDMAGCNT = NDMAGCNT_ARBITRATION_FIXED;
}

/// @brief Selects round-robin arbitration for the twl ndma channels.
/// Nitro dma channels keep their higher priority. Bus access is shared
/// in the order ndma0, ndma1, ndma2, ndma3, dsp/cpu; a candidate with no
/// outstanding request is skipped, and the dsp still takes priority over
/// the cpu. The number of cycles reserved for the dsp/cpu is configurable.
/// @param yieldCycles One of NDMAGCNT_YIELD_CYCLES_*: the number of cycles
/// yielded to the dsp/cpu in each round. The cycles are not wasted when
/// no request is outstanding. The value is OR-ed into the register
/// unchecked.
static inline void dma_twlSetRoundRobinArbitration(u32 yieldCycles)
{
    const u32 globalControl = NDMAGCNT_ARBITRATION_ROUND_ROBIN | yieldCycles;
    REG_NDMAGCNT = globalControl;
}

/// @brief Loads all seven registers of one ndma channel from a config struct.
/// Registers are written in ascending address order, so the control word
/// (which may carry NDMACNT_ENABLE) is stored last, after every other
/// setting is in place.
/// @param dma Channel index, scaled by 7 registers per channel. Not range
/// checked; presumably 0..3.
/// @param config Settings to load. Dereferenced unconditionally, so it must
/// not be null.
static inline void dma_twlSetParams(int dma, const dma_twl_config_t* config)
{
    vu32* const regs = &REG_NDMA0SAD + 7 * dma;

    regs[0] = (u32)config->src;         // SAD
    regs[1] = (u32)config->dst;         // DAD
    regs[2] = config->totalWordCount;   // TCNT
    regs[3] = config->wordCount;        // WCNT
    regs[4] = config->blockInterval;    // BCNT
    regs[5] = config->fillData;         // FDATA
    regs[6] = config->control;          // CNT
}

/// @brief Busy-waits until an ndma channel is no longer enabled.
/// Polls the channel's control register and returns once the
/// NDMACNT_ENABLE bit reads back as zero. There is no timeout.
/// @param dma Channel index, scaled by 7 registers per channel. Not range
/// checked; presumably 0..3.
static inline void dma_twlWait(int dma)
{
    vu32* const control = &REG_NDMA0CNT + 7 * dma;

    for (;;)
    {
        if ((*control & NDMACNT_ENABLE) == 0)
            break;
    }
}

/// @brief Starts an immediate word copy on an ndma channel and returns
/// without waiting for it to finish; use dma_twlWait() to wait.
/// Source and destination both increment. The channel's TCNT and FDATA
/// registers are left untouched.
/// @param dma Channel index, scaled by 7 registers per channel. Not range
/// checked; presumably 0..3.
/// @param src Source address.
/// @param dst Destination address.
/// @param length Size in bytes. It is turned into a word count with
/// length >> 2, so a remainder of 1-3 bytes is not copied.
static inline void dma_twlCopy32Async(int dma, const void* src, void* dst, u32 length)
{
    const u32 wordCount = length >> 2;
    const u32 interval = NDMABCNT_PRESCALER_1 | NDMABCNT_INTERVAL(0);
    const u32 control = NDMACNT_DST_MODE_INCREMENT | NDMACNT_SRC_MODE_INCREMENT |
                        NDMACNT_PHYSICAL_COUNT_1 | NDMACNT_MODE_IMMEDIATE | NDMACNT_ENABLE;

    vu32* const regs = &REG_NDMA0SAD + 7 * dma;
    regs[0] = (u32)src;     // SAD
    regs[1] = (u32)dst;     // DAD
    regs[3] = wordCount;    // WCNT
    regs[4] = interval;     // BCNT
    regs[6] = control;      // CNT, written last because it carries the enable bit
}

/// @brief Blocking word copy on an ndma channel.
/// Starts the transfer with dma_twlCopy32Async() and then spins in
/// dma_twlWait() until the channel is no longer enabled.
/// @param dma Channel index, passed through unchanged to both helpers.
/// @param src Source address.
/// @param dst Destination address.
/// @param length Size in bytes; dma_twlCopy32Async() converts it to words
/// with length >> 2, so a remainder of 1-3 bytes is not copied.
static inline void dma_twlCopy32(int dma, const void* src, void* dst, u32 length)
{
dma_twlCopy32Async(dma, src, dst, length);
dma_twlWait(dma);
}

/// @brief Starts an immediate word fill on an ndma channel and returns
/// without waiting for it to finish; use dma_twlWait() to wait.
/// The destination increments and the source is the channel's fill-data
/// register. The channel's SAD and TCNT registers are left untouched.
/// @param dma Channel index, scaled by 7 registers per channel. Not range
/// checked; presumably 0..3.
/// @param value 32-bit word to store repeatedly.
/// @param dst Destination address.
/// @param length Size in bytes. It is turned into a word count with
/// length >> 2, so a remainder of 1-3 bytes is not filled.
static inline void dma_twlFill32Async(int dma, u32 value, void* dst, u32 length)
{
    const u32 wordCount = length >> 2;
    const u32 interval = NDMABCNT_PRESCALER_1 | NDMABCNT_INTERVAL(0);
    const u32 control = NDMACNT_DST_MODE_INCREMENT | NDMACNT_SRC_MODE_FILLDATA |
                        NDMACNT_PHYSICAL_COUNT_1 | NDMACNT_MODE_IMMEDIATE | NDMACNT_ENABLE;

    vu32* const regs = &REG_NDMA0SAD + 7 * dma;
    regs[1] = (u32)dst;     // DAD
    regs[3] = wordCount;    // WCNT
    regs[4] = interval;     // BCNT
    regs[5] = value;        // FDATA
    regs[6] = control;      // CNT, written last because it carries the enable bit
}

/// @brief Blocking word fill on an ndma channel.
/// Starts the transfer with dma_twlFill32Async() and then spins in
/// dma_twlWait() until the channel is no longer enabled.
/// @param dma Channel index, passed through unchanged to both helpers.
/// @param value 32-bit word to store repeatedly.
/// @param dst Destination address.
/// @param length Size in bytes; dma_twlFill32Async() converts it to words
/// with length >> 2, so a remainder of 1-3 bytes is not filled.
static inline void dma_twlFill32(int dma, u32 value, void* dst, u32 length)
{
dma_twlFill32Async(dma, value, dst, length);
dma_twlWait(dma);
}

#ifdef __cplusplus
}
#endif
136 changes: 136 additions & 0 deletions bootloader/source/tonccpy.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
#include "tonccpy.h"
//# tonccpy.c

//! VRAM-safe cpy.
/*! This version mimics memcpy in functionality, with
    the benefit of working for VRAM as well: every store it
    performs is a halfword or a word, never a single byte.
    Odd leading/trailing bytes are written by read-modify-write
    of the halfword that contains them, so the neighbouring byte
    is read and written back unchanged.
    \param dst  Destination pointer.
    \param src  Source pointer.
    \param size Copy-length in bytes.
    \note The pointers and size need not be word-aligned.
    \note Returns without doing anything when \a size is 0 or
      either pointer is null.
*/
void tonccpy(void *dst, const void *src, uint size)
{
    if(size==0 || dst==0 || src==0)
        return;

    uint count;
    u16 *dst16;         // hword destination
    const u8 *src8;     // byte source

    // Ideal case: copy by 4x words. Leaves tail for later.
    if( ((u32)src|(u32)dst)%4==0 && size>=4)
    {
        const u32 *src32= (const u32*)src;
        u32 *dst32= (u32*)dst;

        count= size/4;
        uint tmp= count&3;
        count /= 4;

        // Duff's Device: the switch jumps into the unrolled loop to copy
        // the (words % 4) leftovers first, then each further pass copies
        // four words; count holds the number of full passes.
        switch(tmp) {
            do {    *dst32++ = *src32++;
        case 3:     *dst32++ = *src32++;
        case 2:     *dst32++ = *src32++;
        case 1:     *dst32++ = *src32++;
        case 0:     ; } while(count--);
        }

        // Check for tail
        size &= 3;
        if(size == 0)
            return;

        src8= (const u8*)src32;
        dst16= (u16*)dst32;
    }
    else        // Unaligned.
    {
        uint dstOfs= (u32)dst&1;
        src8= (const u8*)src;
        // Step back to the containing halfword through a byte pointer;
        // arithmetic directly on void* is a compiler extension, not ISO C.
        dst16= (u16*)((u8*)dst-dstOfs);

        // Head: 1 byte. Keep the low byte already in memory and put the
        // first source byte in the high half.
        if(dstOfs != 0)
        {
            *dst16= (*dst16 & 0xFF) | *src8++<<8;
            dst16++;
            if(--size==0)
                return;
        }
    }

    // Unaligned main: copy by 2x byte.
    count= size/2;
    while(count--)
    {
        *dst16++ = src8[0] | src8[1]<<8;
        src8 += 2;
    }

    // Tail: 1 byte. Replace the low byte, keep the high byte in memory.
    if(size&1)
        *dst16= (*dst16 &~ 0xFF) | *src8;
}
//# toncset.c

//! VRAM-safe memset, internal routine.
/*! This version mimics memset in functionality, with
    the benefit of working for VRAM as well: every store it
    performs is a full word. Partial words at the start and
    end are written by read-modify-write with a byte mask, so
    the bytes outside the requested range are written back
    unchanged.
    \param dst  Destination pointer.
    \param fill Word to fill with. It is stored as-is, so callers
      replicate a byte or halfword across the word beforehand
      (see toncset / toncset16).
    \param size Fill-length in bytes.
    \note The \a dst pointer and \a size need not be
      word-aligned. In the case of unaligned fills, \a fill
      will be masked off to match the situation.
    \note The masks treat the low-order bits of a word as its
      lowest-addressed bytes, i.e. they assume little-endian
      byte order.
    \note Returns without doing anything when \a size is 0 or
      \a dst is null.
*/
void __toncset(void *dst, u32 fill, uint size)
{
    if(size==0 || dst==0)
        return;

    uint left= (u32)dst&3;
    // Step back to the containing word through a byte pointer;
    // arithmetic directly on void* is a compiler extension, not ISO C.
    u32 *dst32= (u32*)((u8*)dst-left);
    u32 count, mask;

    // Unaligned head.
    if(left != 0)
    {
        // Adjust for very small stint: the whole fill fits inside this
        // one word, so mask exactly the requested bytes and stop.
        if(left+size<4)
        {
            mask= BIT_MASK(size*8)<<(left*8);
            *dst32= (*dst32 &~ mask) | (fill & mask);
            return;
        }

        // Keep the `left` bytes below dst, fill the rest of the word.
        mask= BIT_MASK(left*8);
        *dst32= (*dst32 & mask) | (fill&~mask);
        dst32++;
        size -= 4-left;
    }

    // Main stint.
    count= size/4;
    uint tmp= count&3;
    count /= 4;

    // Duff's Device: the switch jumps into the unrolled loop to store the
    // (words % 4) leftovers first, then each further pass stores four
    // words; count holds the number of full passes.
    switch(tmp) {
        do {    *dst32++ = fill;
    case 3:     *dst32++ = fill;
    case 2:     *dst32++ = fill;
    case 1:     *dst32++ = fill;
    case 0:     ; } while(count--);
    }

    // Tail
    size &= 3;
    if(size)
    {
        mask= BIT_MASK(size*8);
        *dst32= (*dst32 &~ mask) | (fill & mask);
    }
}
43 changes: 43 additions & 0 deletions bootloader/source/tonccpy.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
//# Stuff you may not have yet.

#ifndef TONCCPY_H
#define TONCCPY_H


#ifdef __cplusplus
extern "C" {
#endif

#include <nds/ndstypes.h>

typedef unsigned int uint;
//! Mask with the low \a len bits set. The shift is done on a signed int,
//! so the result is only well-defined for len 0..30; __toncset passes at
//! most 24.
#define BIT_MASK(len) ( (1<<(len))-1 )
//! Replicates a byte across a word: 0x12 -> 0x12121212.
/*! Intended for byte values (toncset passes a u8). The low 16 bits
    of (x | x<<8) are duplicated into both halves of the result.
*/
static inline u32 quad8(u16 x)
{
    // Do the final shift on an unsigned 32-bit value: after integer
    // promotion, shifting a u16 left by 16 overflows a signed int
    // (undefined behaviour) whenever bit 15 is set, i.e. for bytes >= 0x80.
    u32 pair = (u16)(x | x<<8);
    return pair | pair<<16;
}


//# Declarations and inlines.

void tonccpy(void *dst, const void *src, uint size);

void __toncset(void *dst, u32 fill, uint size);
static inline void toncset(void *dst, u8 src, uint size);
static inline void toncset16(void *dst, u16 src, uint size);
static inline void toncset32(void *dst, u32 src, uint size);


//! VRAM-safe memset, byte version. Size in bytes.
/*! Replicates \a src into all four bytes of a word with quad8()
    and hands the fill to __toncset().
*/
static inline void toncset(void *dst, u8 src, uint size)
{
    const u32 pattern = quad8(src);
    __toncset(dst, pattern, size);
}

//! VRAM-safe memset, halfword version. Size in hwords.
/*! Duplicates \a src into both halves of a word and hands the fill to
    __toncset() with the length converted to bytes (size*2).
*/
static inline void toncset16(void *dst, u16 src, uint size)
{
    // Widen before shifting: after integer promotion, src<<16 overflows a
    // signed int (undefined behaviour) whenever src >= 0x8000.
    const u32 pattern = (u32)src | (u32)src<<16;
    __toncset(dst, pattern, size*2);
}

//! VRAM-safe memset, word version. Size in words.
/*! Passes \a src through unchanged as the fill word and hands the fill
    to __toncset() with the length converted to bytes (size*4).
*/
static inline void toncset32(void *dst, u32 src, uint size)
{
    const uint bytes = size*4;
    __toncset(dst, src, bytes);
}

#ifdef __cplusplus
}
#endif
#endif

0 comments on commit d030ddc

Please sign in to comment.