| /* Copyright (c) 2005-2013 ARM Ltd. All rights reserved. |
| |
| Redistribution and use in source and binary forms, with or without |
| modification, are permitted provided that the following conditions |
| are met: |
| 1. Redistributions of source code must retain the above copyright |
| notice, this list of conditions and the following disclaimer. |
| 2. Redistributions in binary form must reproduce the above copyright |
| notice, this list of conditions and the following disclaimer in the |
| documentation and/or other materials provided with the distribution. |
| 3. The name of the company may not be used to endorse or promote |
| products derived from this software without specific prior written |
| permission. |
| |
| THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED |
| WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF |
| MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. |
| IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED |
| TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ |
| |
/* This file gives a basic initialisation of a Cortex-A series core.  It is
   the bare minimum required to get a Cortex-A core running with a
   semihosting interface.
| |
   It sets up a basic 1:1 physical address to virtual address mapping;
   turns the MMU on; enables branch prediction; activates any integrated
   caches; enables the Advanced SIMD and VFP co-processors; and installs
   basic exception handlers.
| |
| It does not handle peripherals, and assumes all memory is Normal. |
| |
| It does not change processor state from the startup privilege and security |
| level. |
| |
| This has only been tested to work in ARM state. |
| |
| By default it assumes exception vectors are located from address 0. |
| However, if this is not true they can be moved by defining the |
| _rdimon_vector_base symbol. For example if you have HIVECS enabled you |
| may pass --defsym _rdimon_vector_base=0xffff0000 on the linker command |
| line. */ |
| |
| /* __ARM_ARCH_PROFILE is defined from GCC 4.8 onwards, however __ARM_ARCH_7A |
| has been defined since 4.2 onwards, which is when v7-a support was added |
| and hence 'A' profile support was added in the compiler. Allow for this |
| file to be built with older compilers. */ |
| #if defined(__ARM_ARCH_7A__) || (__ARM_ARCH_PROFILE == 'A') |
| .syntax unified |
| .arch armv7-a |
| .arm |
| |
| @ CPU Initialisation |
| .globl _rdimon_hw_init_hook |
| .type _rdimon_hw_init_hook, %function |
| |
| _rdimon_hw_init_hook: |
| @ Only run the code on CPU 0 - otherwise spin |
| mrc 15, 0, r4, cr0, cr0, 5 @ Read MPIDR |
| ands r4, r4, #15 |
| spin: |
| bne spin |
| |
| mov r10, lr @ Save LR for final return |
| |
| #ifdef __ARMEB__ |
| @ Setup for Big Endian |
| setend be |
| mrc 15, 0, r4, cr1, cr0, 0 @ Read SCTLR |
| orr r4, r4, #(1<<25) @ Switch to Big Endian (Set SCTLR.EE) |
| mcr 15, 0, r4, cr1, cr0, 0 @ Write SCTLR |
| #else |
| @ Setup for Little Endian |
| setend le |
| mrc 15, 0, r4, cr1, cr0, 0 @ Read SCTLR |
| bic r4, r4, #(1<<25) @ Switch to LE (unset SCTLR.EE) |
| mcr 15, 0, r4, cr1, cr0, 0 @ Write SCTLR |
| #endif |
| |
| bl is_a15_a7 |
| |
| @ For Cortex-A15 and Cortex-A7 only: |
| @ Write zero into the ACTLR to turn everything on. |
| itt eq |
| moveq r4, #0 |
| mcreq 15, 0, r4, c1, c0, 1 |
| isb |
| |
| @ For Cortex-A15 and Cortex-A7 only: |
| @ Set ACTLR:SMP bit before enabling the caches and MMU, |
| @ or performing any cache and TLB maintenance operations. |
| ittt eq |
| mrceq 15, 0, r4, c1, c0, 1 @ Read ACTLR |
| orreq r4, r4, #(1<<6) @ Enable ACTLR:SMP |
| mcreq 15, 0, r4, c1, c0, 1 @ Write ACTLR |
| isb |
| |
| @ Setup for exceptions being taken to Thumb/ARM state |
| mrc 15, 0, r4, cr1, cr0, 0 @ Read SCTLR |
| #if defined(__thumb__) |
| orr r4, r4, #(1 << 30) @ Enable SCTLR.TE |
| #else |
| bic r4, r4, #(1 << 30) @ Disable SCTLR.TE |
| #endif |
| mcr 15, 0, r4, cr1, cr0, 0 @ Write SCTLR |
| |
| bl __reset_caches |
| |
| mrc 15, 0, r4, cr1, cr0, 0 @ Read SCTLR |
| orr r4, r4, #(1<<22) @ Enable unaligned mode |
| bic r4, r4, #2 @ Disable alignment faults |
| bic r4, r4, #1 @ Disable MMU |
| mcr 15, 0, r4, cr1, cr0, 0 @ Write SCTLR |
| |
| mov r4, #0 |
| mcr 15, 0, r4, cr8, cr7, 0 @ Write TLBIALL - Invaliidate unified |
| @ TLB |
| @ Setup MMU Primary table P=V mapping. |
| mvn r4, #0 |
| mcr 15, 0, r4, cr3, cr0, 0 @ Write DACR |
| |
| mov r4, #0 @ Always use TTBR0, no LPAE |
| mcr 15, 0, r4, cr2, cr0, 2 @ Write TTBCR |
| adr r4, page_table_addr @ Load the base for vectors |
| ldr r4, [r4] |
| mrc p15, 0, r0, c0, c0, 5 @ read MPIDR |
| tst r0, #0x80000000 @ bis[31] |
| @ Set page table flags - there are two page table flag formats for the |
| @ architecture. For systems without multiprocessor extensions we use 0x1 |
| @ which is Inner cacheable/Outer non-cacheable. For systems with |
| @ multiprocessor extensions we use 0x59 which is Inner/Outer write-back, |
| @ no write-allocate, and cacheable. See the ARMARM-v7AR for more details. |
| it ne |
| addne r4, r4, #0x58 |
| add r4, r4, #1 |
| |
| mcr 15, 0, r4, cr2, cr0, 0 @ Write TTBR0 |
| |
| mov r0, #34 @ 0x22 @ TR0 and TR1 - normal memory |
| orr r0, r0, #(1 << 19) @ Shareable |
| mcr 15, 0, r0, cr10, cr2, 0 @ Write PRRR |
| movw r0, #0x33 |
| movt r0, #0x33 |
| mcr 15, 0, r0, cr10, cr2, 1 @ Write NMRR |
| mrc 15, 0, r0, cr1, cr0, 0 @ Read SCTLR |
| bic r0, r0, #(1 << 28) @ Clear TRE bit |
| mcr 15, 0, r0, cr1, cr0, 0 @ Write SCTLR |
| |
| @ Now install the vector code - we move the Vector code from where it is |
| @ in the image to be based at _rdimon_vector_base. We have to do this copy |
| @ as the code is all PC-relative. We actually cheat and do a BX <reg> so |
| @ that we are at a known address relatively quickly and have to move as |
| @ little code as possible. |
| mov r7, #(VectorCode_Limit - VectorCode) |
| adr r5, VectorCode |
| adr r6, vector_base_addr @ Load the base for vectors |
| ldr r6, [r6] |
| |
| copy_loop: @ Do the copy |
| ldr r4, [r5], #4 |
| str r4, [r6], #4 |
| subs r7, r7, #4 |
| bne copy_loop |
| |
| mrc 15, 0, r4, cr1, cr0, 0 @ Read SCTLR |
| bic r4, r4, #0x1000 @ Disable I Cache |
| bic r4, r4, #4 @ Disable D Cache |
| orr r4, r4, #1 @ Enable MMU |
| bic r4, r4, #(1 << 28) @ Clear TRE bit |
| mcr 15, 0, r4, cr1, cr0, 0 @ Write SCTLR |
| mrc 15, 0, r4, cr1, cr0, 2 @ Read CPACR |
| orr r4, r4, #0x00f00000 @ Turn on VFP Co-procs |
| bic r4, r4, #0x80000000 @ Clear ASEDIS bit |
| mcr 15, 0, r4, cr1, cr0, 2 @ Write CPACR |
| isb |
| mov r4, #0 |
| mcr 15, 0, r4, cr7, cr5, 4 @ Flush prefetch buffer |
| mrc 15, 0, r4, cr1, cr0, 2 @ Read CPACR |
| ubfx r4, r4, #20, #4 @ Extract bits [20, 23) |
| cmp r4, #0xf @ If not all set then the CPU does not |
| itt eq @ have FP or Advanced SIMD. |
| moveq r4, #0x40000000 @ Enable FP and Advanced SIMD |
| mcreq 10, 7, r4, cr8, cr0, 0 @ vmsr fpexc, r4 |
| skip_vfp_enable: |
| bl __enable_caches @ Turn caches on |
| bx r10 @ Return to CRT startup routine |
| |
| @ This enable us to be more precise about which caches we want |
| init_cpu_client_enable_dcache: |
| init_cpu_client_enable_icache: |
| mov r0, #1 |
| bx lr |
| |
| vector_base_addr: |
| .word _rdimon_vector_base |
| .weak _rdimon_vector_base |
| page_table_addr: |
| .word page_tables |
| |
| @ Vector code - must be PIC and in ARM state. |
| VectorCode: |
| b vector_reset |
| b vector_undef |
| b vector_swi |
| b vector_prefetch |
| b vector_dataabt |
| b vector_reserved |
| b vector_irq |
| b vector_fiq |
| |
| vector_reset: |
| adr sp, vector_sp_base |
| push {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, sl, fp, ip, lr} |
| mov r4, #0 |
| b vector_common |
| vector_undef: |
| adr sp, vector_sp_base |
| push {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, sl, fp, ip, lr} |
| mov r4, #1 |
| b vector_common |
| vector_swi: |
| adr sp, vector_sp_base |
| push {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, sl, fp, ip, lr} |
| mov r4, #2 |
| b vector_common |
| vector_prefetch: |
| adr sp, vector_sp_base |
| push {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, sl, fp, ip, lr} |
| mov r4, #3 |
| b vector_common |
| vector_dataabt: |
| adr sp, vector_sp_base |
| push {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, sl, fp, ip, lr} |
| mov r4, #4 |
| b vector_common |
| vector_reserved: |
| adr sp, vector_sp_base |
| push {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, sl, fp, ip, lr} |
| mov r4, #5 |
| b vector_common |
| vector_irq: |
| adr sp, vector_sp_base |
| push {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, sl, fp, ip, lr} |
| mov r4, #6 |
| b vector_common |
| vector_fiq: |
| adr sp, vector_sp_base |
| push {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, sl, fp, ip, lr} |
| mov r4, #7 |
| b vector_common |
| vector_common: |
| adr r1, vector_common_adr @ Find where we're going to |
| ldr r1, [r1] |
| bx r1 @ And branch there |
| vector_common_adr: |
| .word vector_common_2 @ Common handling code |
| |
| @ Vector stack |
| .p2align 3 @ Align to 8 byte boundary boundary to |
| @ keep ABI compatibility |
| .fill 32, 4, 0 @ 32-entry stack is enough for vector |
| @ handlers. |
| vector_sp_base: |
| VectorCode_Limit: |
| @ End of PIC code for vectors |
| |
| @ Common Handling of vectors |
| .type vector_common_2, %function |
| vector_common_2: |
| mrs r1, APSR |
| mrs r2, SPSR |
| push {r1, r2} @ Save PSRs |
| |
| @ Output the vector we have caught |
| bl out_nl |
| adr r0, which_vector |
| bl out_string |
| adr r0, vector_names |
| mov r1, #11 |
| mla r0, r4, r1, r0 |
| bl out_string |
| bl out_nl |
| |
| @ Dump the registers |
| adrl r6, register_names |
| mov r7, #0 |
| dump_r_loop: |
| mov r0, r6 |
| bl out_string |
| add r6, r6, #6 |
| ldr r0, [sp, r7, lsl #2] |
| bl out_word |
| bl out_nl |
| add r7, r7, #1 |
| cmp r7, #16 |
| blt dump_r_loop |
| adr r0, end |
| bl out_string |
| |
| @ And exit |
| mov r0, #24 |
| orr r1, r4, #0x20000 |
| svc 0x00123456 |
| |
| @ Output the string in r0 |
| out_string: |
| push {lr} |
| mov r1, r0 |
| mov r0, #4 |
| svc 0x00123456 |
| pop {pc} |
| |
| @ Output a New-line |
| out_nl: |
| mov r0, #10 |
| @ Fallthrough |
| |
| @ Output the character in r0 |
| out_char: |
| push {lr} |
| strb r0, [sp, #-4]! |
| mov r0, #3 |
| mov r1, sp |
| svc 0x00123456 |
| add sp, sp, #4 |
| pop {pc} |
| |
| @ Output the value of r0 as a hex-word |
| out_word: |
| push {r4, r5, r6, lr} |
| mov r4, r0 |
| mov r5, #28 |
| adr r6, hexchars |
| word_loop: |
| lsr r0, r4, r5 |
| and r0, r0, #15 |
| ldrb r0, [r6, r0] |
| bl out_char |
| subs r5, r5, #4 |
| bpl word_loop |
| pop {r4, r5, r6, pc} |
| |
| hexchars: |
| .ascii "0123456789abcdef" |
| |
| which_vector: |
| .asciz "Hit vector:" |
| end: |
| .asciz "End.\n" |
| |
| vector_names: |
| .asciz "reset " |
| .asciz "undef " |
| .asciz "swi " |
| .asciz "prefetch " |
| .asciz "data abort" |
| .asciz "reserved " |
| .asciz "irq " |
| .asciz "fiq " |
| |
| register_names: |
| .asciz "apsr " |
| .asciz "spsr " |
| .asciz "r0 " |
| .asciz "r1 " |
| .asciz "r2 " |
| .asciz "r3 " |
| .asciz "r4 " |
| .asciz "r5 " |
| .asciz "r6 " |
| .asciz "r7 " |
| .asciz "r8 " |
| .asciz "r9 " |
| .asciz "r10 " |
| .asciz "r11 " |
| .asciz "r12 " |
| .asciz "r14 " |
| |
| .p2align 3 |
| |
| |
| @ Enable the caches |
| __enable_caches: |
| mov r0, #0 |
| mcr 15, 0, r0, cr8, cr7, 0 @ Invalidate all unified-TLB |
| mov r0, #0 |
| mcr 15, 0, r0, cr7, cr5, 6 @ Invalidate branch predictor |
| mrc 15, 0, r4, cr1, cr0, 0 @ Read SCTLR |
| orr r4, r4, #0x800 @ Enable branch predictor |
| mcr 15, 0, r4, cr1, cr0, 0 @ Set SCTLR |
| mov r5, lr @ Save LR as we're going to BL |
| mrc 15, 0, r4, cr1, cr0, 0 @ Read SCTLR |
| bl init_cpu_client_enable_icache |
| cmp r0, #0 |
| it ne |
| orrne r4, r4, #0x1000 @ Enable I-Cache |
| bl init_cpu_client_enable_dcache |
| cmp r0, #0 |
| it ne |
| orrne r4, r4, #4 |
| mcr 15, 0, r4, cr1, cr0, 0 @ Enable D-Cache |
| bx r5 @ Return |
| |
| __reset_caches: |
| mov ip, lr @ Save LR |
| mov r0, #0 |
| mcr 15, 0, r0, cr7, cr5, 6 @ Invalidate branch predictor |
| mrc 15, 0, r6, cr1, cr0, 0 @ Read SCTLR |
| mrc 15, 0, r0, cr1, cr0, 0 @ Read SCTLR! |
| bic r0, r0, #0x1000 @ Disable I cache |
| mcr 15, 0, r0, cr1, cr0, 0 @ Write SCTLR |
| mrc 15, 1, r0, cr0, cr0, 1 @ Read CLIDR |
| tst r0, #3 @ Harvard Cache? |
| mov r0, #0 |
| it ne |
| mcrne 15, 0, r0, cr7, cr5, 0 @ Invalidate Instruction Cache? |
| |
| mrc 15, 0, r1, cr1, cr0, 0 @ Read SCTLR (again!) |
| orr r1, r1, #0x800 @ Enable branch predictor |
| |
| @ If we're not enabling caches we have |
| @ no more work to do. |
| bl init_cpu_client_enable_icache |
| cmp r0, #0 |
| it ne |
| orrne r1, r1, #0x1000 @ Enable I-Cache now - |
| @ We actually only do this if we have a |
| @ Harvard style cache. |
| it eq |
| bleq init_cpu_client_enable_dcache |
| itt eq |
| cmpeq r0, #0 |
| beq Finished1 |
| |
| mcr 15, 0, r1, cr1, cr0, 0 @ Write SCTLR (turn on Branch predictor & I-cache) |
| |
| mrc 15, 1, r0, cr0, cr0, 1 @ Read CLIDR |
| ands r3, r0, #0x7000000 |
| lsr r3, r3, #23 @ Total cache levels << 1 |
| beq Finished1 |
| |
| mov lr, #0 @ lr = cache level << 1 |
| Loop11: |
| mrc 15, 1, r0, cr0, cr0, 1 @ Read CLIDR |
| add r2, lr, lr, lsr #1 @ r2 holds cache 'set' position |
| lsr r1, r0, r2 @ Bottom 3-bits are Ctype for this level |
| and r1, r1, #7 @ Get those 3-bits alone |
| cmp r1, #2 |
| blt Skip1 @ No cache or only I-Cache at this level |
| mcr 15, 2, lr, cr0, cr0, 0 @ Write CSSELR |
| mov r1, #0 |
| isb sy |
| mrc 15, 1, r1, cr0, cr0, 0 @ Read CCSIDR |
| and r2, r1, #7 @ Extract line length field |
| add r2, r2, #4 @ Add 4 for the line length offset (log2 16 bytes) |
| movw r0, #0x3ff |
| ands r0, r0, r1, lsr #3 @ r0 is the max number on the way size |
| clz r4, r0 @ r4 is the bit position of the way size increment |
| movw r5, #0x7fff |
| ands r5, r5, r1, lsr #13 @ r5 is the max number of the index size (right aligned) |
| Loop21: |
| mov r7, r0 @ r7 working copy of max way size |
| Loop31: |
| orr r1, lr, r7, lsl r4 @ factor in way number and cache number |
| orr r1, r1, r5, lsl r2 @ factor in set number |
| tst r6, #4 @ D-Cache on? |
| ite eq |
| mcreq 15, 0, r1, cr7, cr6, 2 @ No - invalidate by set/way |
| mcrne 15, 0, r1, cr7, cr14, 2 @ yes - clean + invalidate by set/way |
| subs r7, r7, #1 @ Decrement way number |
| bge Loop31 |
| subs r5, r5, #1 @ Decrement set number |
| bge Loop21 |
| Skip1: |
| add lr, lr, #2 @ increment cache number |
| cmp r3, lr |
| bgt Loop11 |
| Finished1: |
| @ Now we know the caches are clean we can: |
| mrc 15, 0, r4, cr1, cr0, 0 @ Read SCTLR |
| bic r4, r4, #4 @ Disable D-Cache |
| mcr 15, 0, r4, cr1, cr0, 0 @ Write SCTLR |
| mov r4, #0 |
| mcr 15, 0, r4, cr7, cr5, 6 @ Write BPIALL |
| |
| bx ip @ Return |
| |
| @ Set Z if this is a Cortex-A15 or Cortex_A7 |
| @ Other flags corrupted |
| is_a15_a7: |
| mrc 15, 0, r8, c0, c0, 0 |
| movw r9, #0xfff0 |
| movt r9, #0xff0f |
| and r8, r8, r9 |
| movw r9, #0xc0f0 |
| movt r9, #0x410f |
| cmp r8, r9 |
| movw r9, #0xc070 |
| movt r9, #0x410f |
| it ne |
| cmpne r8, r9 |
| bx lr |
| |
| @ Descriptor type: Section |
| @ Bufferable: True |
| @ Cacheable: True |
| @ Execute Never: False |
| @ Domain: 0 |
| @ Impl. Defined: 0 |
| @ Access: 0/11 Full access |
| @ TEX: 001 |
| @ Shareable: False |
| @ Not Global: False |
| @ Supersection: False |
| #define PT(X) \ |
| .word X; |
| #define PT2(X) \ |
| PT(X) PT(X + 0x100000) PT(X + 0x200000) PT(X + 0x300000) |
| #define PT3(X) \ |
| PT2(X) PT2(X + 0x400000) PT2(X + 0x800000) PT2(X + 0xc00000) |
| #define PT4(X) \ |
| PT3(X) PT3(X + 0x1000000) PT3(X + 0x2000000) PT3(X + 0x3000000) |
| #define PT5(X) \ |
| PT4(X) PT4(X + 0x4000000) PT4(X + 0x8000000) PT4(X + 0xc000000) |
| #define PT6(X) \ |
| PT5(X) PT5(X + 0x10000000) PT5(X + 0x20000000) PT5(X + 0x30000000) |
| #define PT7(X) \ |
| PT6(X) PT6(X + 0x40000000) PT6(X + 0x80000000) PT6(X + 0xc0000000) |
| |
| .section page_tables_section, "aw", %progbits |
| .p2align 14 |
| page_tables: |
| PT7(0x1c0e) |
| |
| #endif //#if defined(__ARM_ARCH_7A__) || __ARM_ARCH_PROFILE == 'A' |