diff --git a/bootloader/source/arm7clear.s b/bootloader/source/arm7clear.s deleted file mode 100644 index 52343e36..00000000 --- a/bootloader/source/arm7clear.s +++ /dev/null @@ -1,70 +0,0 @@ -/*----------------------------------------------------------------- - - Copyright (C) 2005 Michael "Chishm" Chisholm - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License - as published by the Free Software Foundation; either version 2 - of the License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
- - If you use this code, please give due credit and email me about your - project at chishm@hotmail.com -------------------------------------------------------------------*/ - - .arm - .global arm7clearRAM - .type arm7clearRAM STT_FUNC -arm7clearRAM: - - push {r0-r9} - // clear exclusive IWRAM - // 0380:0000 to 0380:FFFF, total 64KiB - mov r0, #0 - mov r1, #0 - mov r2, #0 - mov r3, #0 - mov r4, #0 - mov r5, #0 - mov r6, #0 - mov r7, #0 - mov r8, #0x03800000 - sub r8, #0x00008000 - mov r9, #0x03800000 - orr r9, r9, #0x10000 -clear_IWRAM_loop: - stmia r8!, {r0, r1, r2, r3, r4, r5, r6, r7} - cmp r8, r9 - blt clear_IWRAM_loop - - // clear most of EWRAM - except after RAM end - 0xc000, which has the bootstub - mov r8, #0x02000000 - - ldr r9,=0x4004008 - ldr r9,[r9] - ands r9,r9,#0x8000 - bne dsi_mode - - mov r9, #0x02400000 - b ds_mode -dsi_mode: - mov r9, #0x03000000 -ds_mode: - sub r9, #0x0000c000 -clear_EWRAM_loop: - stmia r8!, {r0, r1, r2, r3, r4, r5, r6, r7} - cmp r8, r9 - blt clear_EWRAM_loop - - pop {r0-r9} - - bx lr - diff --git a/bootloader/source/boot.c b/bootloader/source/boot.c index d039e564..bffbd6a4 100644 --- a/bootloader/source/boot.c +++ b/bootloader/source/boot.c @@ -47,13 +47,13 @@ Helpful information: #define ARM7 #include #include +#include "dmaTwl.h" +#include "tonccpy.h" #include "fat.h" #include "dldi_patcher.h" #include "card.h" #include "boot.h" -void arm7clearRAM(); - //+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ // Important things #define TEMP_MEM 0x02FFD000 @@ -159,6 +159,15 @@ void passArgs_ARM7 (void) { +void memset_addrs_arm7(u32 start, u32 end) /* Zero-fills the byte range [start, end), either with toncset on the CPU or with NDMA channel 0. The length is computed as end minus start, so callers must pass end >= start. */ +{ + if (!dsiMode && !(REG_SCFG_EXT & BIT(16))) { /* CPU path: taken only when dsiMode is false and SCFG_EXT bit 16 is clear (presumably NDMA is not reachable in that state; TODO confirm) */ + toncset((u32*)start, 0, ((int)end - (int)start)); + return; + } + dma_twlFill32(0, 0, (u32*)start, ((int)end - (int)start)); /* blocking: dma_twlFill32 starts the fill and then waits for the channel to go idle */ +} + /*------------------------------------------------------------------------- resetMemory_ARM7 Clears all of the NDS's RAM that is visible to the ARM7 @@ -182,6 +191,13 @@ void resetMemory_ARM7 
(void) } REG_SOUNDCNT = 0; + REG_SNDCAP0CNT = 0; + REG_SNDCAP1CNT = 0; + + REG_SNDCAP0DAD = 0; + REG_SNDCAP0LEN = 0; + REG_SNDCAP1DAD = 0; + REG_SNDCAP1LEN = 0; //clear out ARM7 DMA channels and timers for (i=0; i<4; i++) { @@ -192,12 +208,15 @@ void resetMemory_ARM7 (void) TIMER_DATA(i) = 0; } - arm7clearRAM(); + memset_addrs_arm7(0x03800000 - 0x8000, 0x03800000 + (dsiMode ? 0xC000 : 0x10000)); // clear exclusive IWRAM + memset_addrs_arm7(0x02004000, (dsiMode ? 0x03000000 : 0x02400000) - 0xC000); // clear part of EWRAM - except before bootstub REG_IE = 0; REG_IF = ~0; - (*(vu32*)(0x04000000-4)) = 0; //IRQ_HANDLER ARM7 version - (*(vu32*)(0x04000000-8)) = ~0; //VBLANK_INTR_WAIT_FLAGS, ARM7 version + REG_AUXIE = 0; + REG_AUXIF = ~0; + *(vu32*)0x0380FFFC = 0; // IRQ_HANDLER ARM7 version + *(vu32*)0x0380FFF8 = 0; // VBLANK_INTR_WAIT_FLAGS, ARM7 version REG_POWERCNT = 1; //turn off power to stuff // Get settings location diff --git a/bootloader/source/dmaTwl.h b/bootloader/source/dmaTwl.h new file mode 100644 index 00000000..df34ca77 --- /dev/null +++ b/bootloader/source/dmaTwl.h @@ -0,0 +1,223 @@ +#pragma once + +typedef struct /* One NDMA channel's seven register values, declared in the same order in which dma_twlSetParams writes them to the channel. */ +{ + const void* src; // Source address; not used in fill mode + void* dst; // Destination address + u32 totalWordCount; // For auto-start mode without infinite repeat + u32 wordCount; // Number of words to transfer per start trigger + u32 blockInterval; // Sets prescaler and cycles of delay between physical blocks + u32 fillData; // For fill mode + u32 control; /* Written last by dma_twlSetParams, to the channel's CNT register; built from NDMACNT_* flags */ +} dma_twl_config_t; + +#define REG_NDMAGCNT (*(vu32*)0x04004100) /* Global NDMA control: takes one NDMAGCNT_ARBITRATION_* value, optionally OR-ed with one NDMAGCNT_YIELD_CYCLES_* value */ + +#define NDMAGCNT_YIELD_CYCLES_0 (0 << 16) +#define NDMAGCNT_YIELD_CYCLES_1 (1 << 16) +#define NDMAGCNT_YIELD_CYCLES_2 (2 << 16) +#define NDMAGCNT_YIELD_CYCLES_4 (3 << 16) +#define NDMAGCNT_YIELD_CYCLES_8 (4 << 16) +#define NDMAGCNT_YIELD_CYCLES_16 (5 << 16) +#define NDMAGCNT_YIELD_CYCLES_32 (6 << 16) +#define NDMAGCNT_YIELD_CYCLES_64 (7 << 16) +#define NDMAGCNT_YIELD_CYCLES_128 (8 << 16) +#define 
NDMAGCNT_YIELD_CYCLES_256 (9 << 16)
+#define NDMAGCNT_YIELD_CYCLES_512 (10 << 16)
+#define NDMAGCNT_YIELD_CYCLES_1024 (11 << 16)
+#define NDMAGCNT_YIELD_CYCLES_2048 (12 << 16)
+#define NDMAGCNT_YIELD_CYCLES_4096 (13 << 16)
+#define NDMAGCNT_YIELD_CYCLES_8192 (14 << 16)
+#define NDMAGCNT_YIELD_CYCLES_16384 (15 << 16)
+
+#define NDMAGCNT_ARBITRATION_FIXED (0 << 31)
+#define NDMAGCNT_ARBITRATION_ROUND_ROBIN (1u << 31) // unsigned literal: shifting a signed 1 into bit 31 is undefined behaviour in C
+
+#define REG_NDMA0SAD (*(vu32*)0x04004104) // Channel 0 register block; every channel is 7 consecutive 32-bit registers: SAD, DAD, TCNT, WCNT, BCNT, FDATA, CNT
+#define REG_NDMA0DAD (*(vu32*)0x04004108)
+#define REG_NDMA0TCNT (*(vu32*)0x0400410C)
+#define REG_NDMA0WCNT (*(vu32*)0x04004110)
+#define REG_NDMA0BCNT (*(vu32*)0x04004114)
+#define REG_NDMA0FDATA (*(vu32*)0x04004118)
+#define REG_NDMA0CNT (*(vu32*)0x0400411C)
+
+#define REG_NDMA1SAD (*(vu32*)0x04004120)
+#define REG_NDMA1DAD (*(vu32*)0x04004124)
+#define REG_NDMA1TCNT (*(vu32*)0x04004128)
+#define REG_NDMA1WCNT (*(vu32*)0x0400412C)
+#define REG_NDMA1BCNT (*(vu32*)0x04004130)
+#define REG_NDMA1FDATA (*(vu32*)0x04004134)
+#define REG_NDMA1CNT (*(vu32*)0x04004138)
+
+#define REG_NDMA2SAD (*(vu32*)0x0400413C)
+#define REG_NDMA2DAD (*(vu32*)0x04004140)
+#define REG_NDMA2TCNT (*(vu32*)0x04004144)
+#define REG_NDMA2WCNT (*(vu32*)0x04004148)
+#define REG_NDMA2BCNT (*(vu32*)0x0400414C)
+#define REG_NDMA2FDATA (*(vu32*)0x04004150)
+#define REG_NDMA2CNT (*(vu32*)0x04004154)
+
+#define REG_NDMA3SAD (*(vu32*)0x04004158)
+#define REG_NDMA3DAD (*(vu32*)0x0400415C)
+#define REG_NDMA3TCNT (*(vu32*)0x04004160)
+#define REG_NDMA3WCNT (*(vu32*)0x04004164)
+#define REG_NDMA3BCNT (*(vu32*)0x04004168)
+#define REG_NDMA3FDATA (*(vu32*)0x0400416C)
+#define REG_NDMA3CNT (*(vu32*)0x04004170)
+
+#define NDMABCNT_INTERVAL(x) (x) // BCNT value = interval in the low bits OR-ed with one NDMABCNT_PRESCALER_* value
+#define NDMABCNT_PRESCALER_1 (0 << 16)
+#define NDMABCNT_PRESCALER_4 (1 << 16)
+#define NDMABCNT_PRESCALER_16 (2 << 16)
+#define NDMABCNT_PRESCALER_64 (3 << 16)
+
+#define NDMACNT_DST_MODE_INCREMENT (0 << 10) // NDMACNT_* values below are OR-ed together to form a channel's CNT register value
+#define NDMACNT_DST_MODE_DECREMENT (1 << 10)
+#define NDMACNT_DST_MODE_FIXED (2 << 10)
+
+#define NDMACNT_DST_RELOAD (1 << 12)
+
+#define NDMACNT_SRC_MODE_INCREMENT (0 << 13)
+#define NDMACNT_SRC_MODE_DECREMENT (1 << 13)
+#define NDMACNT_SRC_MODE_FIXED (2 << 13)
+#define NDMACNT_SRC_MODE_FILLDATA (3 << 13)
+
+#define NDMACNT_SRC_RELOAD (1 << 15)
+
+#define NDMACNT_PHYSICAL_COUNT_1 (0 << 16)
+#define NDMACNT_PHYSICAL_COUNT_2 (1 << 16)
+#define NDMACNT_PHYSICAL_COUNT_4 (2 << 16)
+#define NDMACNT_PHYSICAL_COUNT_8 (3 << 16)
+#define NDMACNT_PHYSICAL_COUNT_16 (4 << 16)
+#define NDMACNT_PHYSICAL_COUNT_32 (5 << 16)
+#define NDMACNT_PHYSICAL_COUNT_64 (6 << 16)
+#define NDMACNT_PHYSICAL_COUNT_128 (7 << 16)
+#define NDMACNT_PHYSICAL_COUNT_256 (8 << 16)
+#define NDMACNT_PHYSICAL_COUNT_512 (9 << 16)
+#define NDMACNT_PHYSICAL_COUNT_1024 (10 << 16)
+#define NDMACNT_PHYSICAL_COUNT_2048 (11 << 16)
+#define NDMACNT_PHYSICAL_COUNT_4096 (12 << 16)
+#define NDMACNT_PHYSICAL_COUNT_8192 (13 << 16)
+#define NDMACNT_PHYSICAL_COUNT_16384 (14 << 16)
+#define NDMACNT_PHYSICAL_COUNT_32768 (15 << 16)
+
+#define NDMACNT_MODE_TIMER_0 (0 << 24)
+#define NDMACNT_MODE_TIMER_1 (1 << 24)
+#define NDMACNT_MODE_TIMER_2 (2 << 24)
+#define NDMACNT_MODE_TIMER_3 (3 << 24)
+#define NDMACNT_MODE_DS_SLOTA_ROM_XFER (4 << 24)
+#define NDMACNT_MODE_DS_SLOTB_ROM_XFER (5 << 24)
+#define NDMACNT_MODE_VBLANK (6 << 24)
+
+#ifdef LIBTWL_ARM9 // start-mode values 7 and up are named differently depending on which LIBTWL_ARM* macro is defined
+
+#define NDMACNT_MODE_HBLANK (7 << 24)
+#define NDMACNT_MODE_DISPLAY (8 << 24)
+#define NDMACNT_MODE_MMEM_DISP_FIFO (9 << 24)
+#define NDMACNT_MODE_GX_FIFO (10 << 24)
+#define NDMACNT_MODE_CAMERA (11 << 24)
+
+#endif
+
+#ifdef LIBTWL_ARM7
+
+#define NDMACNT_MODE_WIFI (7 << 24)
+#define NDMACNT_MODE_SDMMC (8 << 24)
+#define NDMACNT_MODE_SDIO (9 << 24)
+#define NDMACNT_MODE_AES_IN (10 << 24)
+#define NDMACNT_MODE_AES_OUT (11 << 24)
+#define NDMACNT_MODE_MIC (12 << 24)
+
+#endif
+
+#define NDMACNT_MODE_IMMEDIATE (1 << 28)
+#define NDMACNT_REPEAT_INFINITELY (1 << 29)
+#define NDMACNT_IRQ (1 << 30)
+#define NDMACNT_ENABLE (1u << 31) // unsigned literal: shifting a signed 1 into bit 31 is undefined behaviour in C; dma_twlWait polls this bit
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// @brief Configures twl ndma to use fixed arbitration.
+/// In this mode ndma0 has the highest and ndma3 the lowest priority,
+/// similar to the nitro dma channels. Note that ndma0 has a lower
+/// priority than nitro dma channel 3. When ndma channels are active
+/// the dsp and cpu cannot access the bus.
+static inline void dma_twlSetFixedArbitration(void)
+{
+    REG_NDMAGCNT = NDMAGCNT_ARBITRATION_FIXED;
+}
+
+/// @brief Configures twl ndma to use round robin arbitration.
+/// In this mode nitro dma channels still have a higher priority,
+/// but bus access is distributed between all ndma channels and
+/// the dsp and cpu.
+/// This is done in the order ndma0, ndma1, ndma2, ndma3, dsp/cpu.
+/// Candidates that do not have any outstanding request are skipped,
+/// and the dsp takes priority over the cpu (as usual). The amount
+/// of cycles reserved for the dsp/cpu is configurable.
+/// @param yieldCycles The number of cycles that will be yielded to the
+/// dsp/cpu in the round robin schedule. When there is no request
+/// outstanding the cycles will not be wasted. Should be one of
+/// NDMAGCNT_YIELD_CYCLES_*.
+static inline void dma_twlSetRoundRobinArbitration(u32 yieldCycles)
+{
+    REG_NDMAGCNT = NDMAGCNT_ARBITRATION_ROUND_ROBIN | yieldCycles;
+}
+
+static inline void dma_twlSetParams(int dma, const dma_twl_config_t* config) /// @brief Writes all seven registers of channel @p dma (registers exist for 0-3) from @p config; CNT is written last.
+{
+    vu32* channel = &(&REG_NDMA0SAD)[7 * dma]; // channels are 7 words apart, starting at NDMA0SAD
+    channel[0] = (u32)config->src;
+    channel[1] = (u32)config->dst;
+    channel[2] = config->totalWordCount;
+    channel[3] = config->wordCount;
+    channel[4] = config->blockInterval;
+    channel[5] = config->fillData;
+    channel[6] = config->control;
+}
+
+static inline void dma_twlWait(int dma) /// @brief Busy-waits until channel @p dma no longer has NDMACNT_ENABLE set in its CNT register.
+{
+    vu32* cnt = &(&REG_NDMA0CNT)[7 * dma]; // same 7-word channel stride, offset to the CNT register
+    while (*cnt & NDMACNT_ENABLE);
+}
+
+static inline void dma_twlCopy32Async(int dma, const void* src, void* dst, u32 length) /// @brief Starts an immediate word copy of @p length bytes on channel @p dma and returns without waiting.
+{
+    vu32* channel = &(&REG_NDMA0SAD)[7 * dma];
+    channel[0] = (u32)src; //SAD
+    channel[1] = (u32)dst; //DAD
+    channel[3] = length >> 2; //WCNT - bytes to 32-bit words; a remainder of 1-3 bytes is not transferred
+    channel[4] = NDMABCNT_PRESCALER_1 | NDMABCNT_INTERVAL(0); //BCNT
+    channel[6] = NDMACNT_DST_MODE_INCREMENT | NDMACNT_SRC_MODE_INCREMENT |
+        NDMACNT_PHYSICAL_COUNT_1 | NDMACNT_MODE_IMMEDIATE | NDMACNT_ENABLE; // CNT; TCNT (channel[2]) and FDATA (channel[5]) are left as they were
+}
+
+static inline void dma_twlCopy32(int dma, const void* src, void* dst, u32 length) /// @brief Blocking variant: dma_twlCopy32Async followed by dma_twlWait.
+{
+    dma_twlCopy32Async(dma, src, dst, length);
+    dma_twlWait(dma);
+}
+
+static inline void dma_twlFill32Async(int dma, u32 value, void* dst, u32 length) /// @brief Starts an immediate fill of @p length bytes at @p dst with the word @p value on channel @p dma; does not wait.
+{
+    vu32* channel = &(&REG_NDMA0SAD)[7 * dma];
+    channel[1] = (u32)dst; //DAD
+    channel[3] = length >> 2; //WCNT - bytes to 32-bit words; a remainder of 1-3 bytes is not filled
+    channel[4] = NDMABCNT_PRESCALER_1 | NDMABCNT_INTERVAL(0); //BCNT
+    channel[5] = value; //FDATA
+    channel[6] = NDMACNT_DST_MODE_INCREMENT | NDMACNT_SRC_MODE_FILLDATA |
+        NDMACNT_PHYSICAL_COUNT_1 | NDMACNT_MODE_IMMEDIATE | NDMACNT_ENABLE; // CNT; SAD is not written because the source mode is FILLDATA
+}
+
+static inline void dma_twlFill32(int dma, u32 value, void* dst, u32 length) /// @brief Blocking variant: dma_twlFill32Async followed by dma_twlWait.
+{
+    dma_twlFill32Async(dma, value, dst, length);
+    dma_twlWait(dma);
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/bootloader/source/tonccpy.c b/bootloader/source/tonccpy.c
new file mode 100644
index 00000000..a51437ec
--- /dev/null
+++ 
b/bootloader/source/tonccpy.c @@ -0,0 +1,136 @@
+#include "tonccpy.h"
+//# tonccpy.c
+
+//! VRAM-safe cpy.
+/*! This version mimics memcpy in functionality, with
+    the benefit of working for VRAM as well. It is also
+    slightly faster than the original memcpy, but faster
+    implementations can be made.
+    \param dst Destination pointer.
+    \param src Source pointer.
+    \param size Copy-length in bytes.
+    \note The pointers and size need not be word-aligned. Nothing is done if size is 0 or either pointer is null.
+*/
+void tonccpy(void *dst, const void *src, uint size)
+{
+    if(size==0 || dst==0 || src==0)
+        return;
+
+    uint count;
+    u16 *dst16; // hword destination
+    u8 *src8; // byte source
+
+    // Ideal case: copy by 4x words. Leaves tail for later.
+    if( ((u32)src|(u32)dst)%4==0 && size>=4)
+    {
+        u32 *src32= (u32*)src, *dst32= (u32*)dst;
+
+        count= size/4;
+        uint tmp= count&3;
+        count /= 4;
+
+        // Duff's Device, good friend!
+        switch(tmp) {
+            do { *dst32++ = *src32++;
+        case 3: *dst32++ = *src32++;
+        case 2: *dst32++ = *src32++;
+        case 1: *dst32++ = *src32++;
+        case 0: ; } while(count--);
+        }
+
+        // Check for tail
+        size &= 3;
+        if(size == 0)
+            return;
+
+        src8= (u8*)src32;
+        dst16= (u16*)dst32;
+    }
+    else // Unaligned.
+    {
+        uint dstOfs= (u32)dst&1;
+        src8= (u8*)src;
+        dst16= (u16*)((u8*)dst-dstOfs); // step back through a byte pointer: arithmetic on void* is a GNU extension, not ISO C
+
+        // Head: 1 byte.
+        if(dstOfs != 0)
+        {
+            *dst16= (*dst16 & 0xFF) | *src8++<<8;
+            dst16++;
+            if(--size==0)
+                return;
+        }
+    }
+
+    // Unaligned main: copy by 2x byte.
+    count= size/2;
+    while(count--)
+    {
+        *dst16++ = src8[0] | src8[1]<<8;
+        src8 += 2;
+    }
+
+    // Tail: 1 byte.
+    if(size&1)
+        *dst16= (*dst16 &~ 0xFF) | *src8;
+}
+//# toncset.c
+
+//! VRAM-safe memset, internal routine.
+/*! This version mimics memset in functionality, with
+    the benefit of working for VRAM as well. It is also
+    slightly faster than the original memset.
+    \param dst Destination pointer.
+    \param fill Word to fill with.
+    \param size Fill-length in bytes.
+    \note The \a dst pointer and \a size need not be
+    word-aligned. In the case of unaligned fills, \a fill
+    will be masked off to match the situation.
+*/
+void __toncset(void *dst, u32 fill, uint size)
+{
+    if(size==0 || dst==0)
+        return;
+
+    uint left= (u32)dst&3;
+    u32 *dst32= (u32*)((u8*)dst-left); // round down to the containing word via a byte pointer (void* arithmetic is a GNU extension)
+    u32 count, mask;
+
+    // Unaligned head.
+    if(left != 0)
+    {
+        // Adjust for very small stint.
+        if(left+size<4)
+        {
+            mask= BIT_MASK(size*8)<<(left*8);
+            *dst32= (*dst32 &~ mask) | (fill & mask);
+            return;
+        }
+
+        mask= BIT_MASK(left*8);
+        *dst32= (*dst32 & mask) | (fill&~mask);
+        dst32++;
+        size -= 4-left;
+    }
+
+    // Main stint.
+    count= size/4;
+    uint tmp= count&3;
+    count /= 4;
+
+    switch(tmp) {
+        do { *dst32++ = fill;
+    case 3: *dst32++ = fill;
+    case 2: *dst32++ = fill;
+    case 1: *dst32++ = fill;
+    case 0: ; } while(count--);
+    }
+
+    // Tail
+    size &= 3;
+    if(size)
+    {
+        mask= BIT_MASK(size*8);
+        *dst32= (*dst32 &~ mask) | (fill & mask);
+    }
+}
diff --git a/bootloader/source/tonccpy.h b/bootloader/source/tonccpy.h
new file mode 100644
index 00000000..dd4267dc
--- /dev/null
+++ b/bootloader/source/tonccpy.h
@@ -0,0 +1,43 @@
+//# Stuff you may not have yet.
+
+#ifndef TONCCPY_H
+#define TONCCPY_H
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include // NOTE(review): the header name is missing here (apparently lost in extraction); presumably the header that provides u8/u16/u32 - restore before use
+
+typedef unsigned int uint;
+#define BIT_MASK(len) ( (1<<(len))-1 )
+static inline u32 quad8(u16 x) { x |= x<<8; return x | ((u32)x<<16); } // replicates x into both halfwords; u32 cast keeps the shift out of signed-int overflow
+
+
+//# Declarations and inlines.
+
+void tonccpy(void *dst, const void *src, uint size);
+
+void __toncset(void *dst, u32 fill, uint size);
+static inline void toncset(void *dst, u8 src, uint size);
+static inline void toncset16(void *dst, u16 src, uint size);
+static inline void toncset32(void *dst, u32 src, uint size);
+
+
+//! VRAM-safe memset, byte version. Size in bytes.
+static inline void toncset(void *dst, u8 src, uint size)
+{ __toncset(dst, quad8(src), size); }
+
+//! VRAM-safe memset, halfword version. Size in hwords.
+static inline void toncset16(void *dst, u16 src, uint size)
+{ __toncset(dst, src|((u32)src<<16), size*2); } // u32 cast: src promotes to int, and shifting values >= 0x8000 left by 16 would overflow it
+
+//! VRAM-safe memset, word version. Size in words.
+static inline void toncset32(void *dst, u32 src, uint size)
+{ __toncset(dst, src, size*4); }
+
+#ifdef __cplusplus
+}
+#endif
+#endif