/*
 * Copyright (c) 2006-2019, RT-Thread Development Team
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Origin Authors: Loongson Technology Corporation Limited,
 * caogos <1207280597@qq.com>, Jiaxun Yang <jiaxun.yang@flygoat.com>,
 *
 * Also thanks to Liu Shiwei <liushiwei@gmail.com> and other Loongson
 * Community developers.
 *
 * Change Logs:
 * Date           Author       Notes
 * 2019-12-04     Jiaxun Yang  Initial version
 */

#include <rtconfig.h>

#ifdef RT_USING_SELF_BOOT

#ifndef __ASSEMBLY__
#define __ASSEMBLY__
#endif

#include <mips.h>
#include "selfboot.h"
#include "ls1c.h"
#include "cache.h"

/*
 *   Register usage:
 *
 *	s0	link versus load offset, used to relocate absolute addresses.
 *	s1	tmpsize (also used directly as a scratch register by exc_common)
 *	s2	msize, memory size
 *	s3	free
 *	s4	bonito
 *	s5	dbg
 *	s6	sdCfg
 *	s7	rasave
 *	s8	free
 */

/* Symbolic names for the saved registers listed in the table above. */
#define tmpsize		s1
#define msize		s2
#define bonito		s4
#define dbg			s5
#define sdCfg		s6


/* Macros */
/*
 * PRINTSTR(str): place `str` as a NUL-terminated string in .selfboot_data,
 * load its link-time address into a0 and call stringserial to send it.
 * Uses local label 98. Clobbers a0 and ra, plus every register that
 * stringserial clobbers. Expands to nothing unless RT_SELF_BOOT_DEBUG is
 * defined.
 */
#ifdef RT_SELF_BOOT_DEBUG
#define	PRINTSTR(str) \
    .pushsection .selfboot_data; .align 4; 98: .asciz str; .popsection; la a0, 98b; bal stringserial; nop
#else
#define PRINTSTR(x)
#endif

/*
 * DELAY(count): busy-wait loop. Loads `count` into v0 and spins on local
 * label 99 until v0 is zero; the addiu following the branch decrements v0
 * (it sits in the branch delay slot when used under .set noreorder).
 * Clobbers v0.
 */
#define	DELAY(count)	\
    li v0, count;	\
99:			\
    bnez	v0, 99b;\
    addiu	v0, -1

/*
 * _start: entry point at the beginning of the .selfboot section.
 * Clears CP0 Status and Cause, sets ST0_BEV in Status, programs two SPI0
 * controller registers and then branches to `locate`. The bal also leaves
 * the current run-time address in ra, which `locate` uses to compute s0.
 * The whole section is assembled with .set noreorder, so the instruction
 * after every branch is its delay slot.
 */
.section ".selfboot", "ax"
.set	noreorder
.set	mips32
.globl	_start
.extern	start
_start:

/* NMI/Reset vector starts here*/
  mtc0  zero, CP0_STATUS /* set cp0 status register to zero */
  mtc0  zero, CP0_CAUSE /* set cp0 cause register to zero */

  li  t0, ST0_BEV /* set exception vector to in flash location */
  mtc0 t0, CP0_STATUS

  /* Speed up SPI reading */
  li  t0, 0xbfe80000 /* load SPI0 controller base address to t0 */
  li  t1, 0x17 /* load "div 4, fast_read + burst_en + memory_en double I/O"
                * to t1 for write; not all flash chips support this mode */
  sb  t1, 0x4(t0) /* set sfc_param register */
  li  t1, 0x05
  sb  t1, 0x6(t0) /* set sfc_timing register */

  bal	locate			/* branch out of vector and get current address to ra */
    nop /* delay slot */

/* in-flash exception vectors start here */
/* save the exception types to a0 and print out PANIC message in exc_common */
#define EXC_TLB_REFILL 0x0
#define EXC_CACHE_ERR	0x1
#define EXC_GEN	0x2
#define EXC_INT	0x3

/*
 * Each vector is placed at a fixed offset from the start of .selfboot with
 * .org, loads its EXC_* code into a0 and branches to exc_common. The nop
 * after each branch fills the delay slot.
 */
.org 0x200 /* 0xbfc00200 TLB_REFILL exception */
  li a0, EXC_TLB_REFILL
  b exc_common
  nop

.org 0x300 /* 0xbfc00300 Cache Error exception */
  li a0, EXC_CACHE_ERR
  b exc_common
  nop

.org 0x380 /* 0xbfc00380 General exception */
  li	a0,EXC_GEN
  b exc_common
  nop

.org 0x400 /* 0xbfc00400 Interrupt exception */
  li a0, EXC_INT
  b exc_common
  nop

1: /* not reachable: the vector above always branches away; spin if we get here */
  b 1b
  nop

/*
 * exc_common: common early-exception handler reached from the in-flash
 * vectors.
 * In:  a0 = EXC_* code identifying the vector that was taken.
 * Prints a message naming the exception (only when PRINTSTR is enabled),
 * then dumps CAUSE, STATUS, ERROREPC, EPC and BADVADDR in hex over the
 * debug UART and spins forever. Never returns.
 * Uses s1 as scratch for the comparisons; the print helpers clobber
 * a0-a3, v0, v1 and ra.
 */
exc_common: /* compare a0 against each code in turn, fall through to the next test */
  li	s1, EXC_TLB_REFILL
  bne a0, s1, 1f /* not a TLB refill: try the next code */
  nop
  PRINTSTR("\r\nEARLY_PANIC: Exception TLB Refill")
  b print_cause
  nop
1:
  li	s1, EXC_CACHE_ERR
  bne a0, s1, 1f /* not a cache error: try the next code */
  nop
  PRINTSTR("\r\nEARLY_PANIC: CACHE Error: ")
  mfc0 a0, CP0_CACHEERR /* print the CacheErr register after the message */
  bal hexserial
  nop
  b print_cause
  nop
1:
  li	s1, EXC_GEN
  bne a0, s1, 1f /* not a general exception: try the next code */
  nop
  PRINTSTR("\r\nEARLY_PANIC: General Exception")
  b print_cause
  nop
1:
  li	s1, EXC_INT
  bne a0, s1, print_cause /* unknown code: skip the message and
                          * go straight to the register dump */
  nop
  PRINTSTR("\r\nEARLY_PANIC: Interrupt Exception")
print_cause: /* dump the CP0 registers describing the exception */
  PRINTSTR("\r\nCAUSE=")
  mfc0	a0, CP0_CAUSE
  bal	hexserial
  nop

  PRINTSTR("\r\nSTATUS=")
  mfc0	a0, CP0_STATUS
  bal	hexserial
  nop

  PRINTSTR("\r\nERRORPC=")
  mfc0	a0, CP0_ERROREPC
  bal	hexserial
  nop

  PRINTSTR("\r\nEPC=")
  mfc0	a0, CP0_EPC
  bal	hexserial
  nop

  PRINTSTR("\r\nBADADDR=")
  mfc0	a0, CP0_BADVADDR
  bal	hexserial
  nop

  PRINTSTR("\r\nEARLY: LOOP! Noting to do")
1: /* spin here until the user resets the MCU */
  b 1b
  nop

/* locate here, continue the start process */

/*
 * locate: reached from _start through bal, so ra holds a run-time address
 * inside the reset vector. Computes s0 (link versus load offset) and then
 * programs the PLL / clock divider registers.
 */
locate:
  /* fix the absolute address by ra */
  la  s0, start /* s0 = link-time address of symbol start */
  subu  s0, ra, s0 /* s0 = ra - start */
  and s0, 0xffff0000 /* keep only the upper 16 bits of the difference */

  li	t0, 0xbfe78030 /* load PLL/SDRAM freq config register base to t0 */
  li	t2, (0x80000008 | (PLL_MULT << 8) | (0x3 << 2) | SDRAM_DIV) /* Set PLL
                                                                  * MULT and PLL DIV */
  li	t3, (0x00008003 | (CPU_DIV << 8)) /* set CPU DIV */

  li	t1, 0x2
    sw	t1, 0x4(t0) /* disable CPU_DIV_VALID firstly for adjustment */
    sw	t2, 0x0(t0) /* write START_FREQ */
    sw	t3, 0x4(t0) /* write CLK_DIV_PARAM */

/*
 * Initialize the debug UART (UART2): reset the FIFOs, route UART2 to
 * GPIO36/37, derive the clock feeding the divisor from the PLL registers
 * written above, program the baud divisor for EARLY_DEBUG_BAUD and select
 * 8 data bits with interrupts disabled.
 * Clobbers: v0, v1, a0, a1, a2 (plus the PRINTSTR clobbers at the end).
 */
  la	v0, LS1C_UART2_BASE /* load UART2 base to v0, only UART2 can be debug port */
    li	v1, FIFO_ENABLE|FIFO_RCV_RST|FIFO_XMT_RST|FIFO_TRIGGER_4 /* enable and reset the Rx/Tx FIFOs,
                                                                * select trigger level 4 */
    sb	v1, LS1C_UART_FCR_OFFSET(v0) /* write FCR (FIFO control register) */
    li	v1, CFCR_DLAB /* expose the divisor latch registers */
    sb	v1, LS1C_UART_LCR_OFFSET(v0) /* write LCR (Line control register)*/
    /* Set UART2 reuse with GPIO36,37*/
    li		a0, LS1C_CBUS_FIRST1 /* load CBUS_FIRST1 address to a0 */
    lw		a1, 0x10(a0) /* load value from CBUS_SECOND1 to a1 */
    ori		a1, 0x30 /* a1 |= 0x30, GPIO36,37 as secondary function */
    sw		a1, 0x10(a0) /* write back modified CBUS_SECOND1 from a1 */
  /* Calculate PLL and bit rate */
    li		a0, 0xbfe78030 /* load START_FREQ register address to a0 */
    lw		a1, 0(a0) /* load value from START_FREQ to a1*/
    srl		a1, 8 /* a1 >>= 8 */
    andi	a1, 0xff /* a1 &= 0xff, as a1=PLL_MULT */
    li		a2, APB_CLK /* a2 = APB_CLK = 24Mhz (External Clock Freq) */
    srl		a2, 2 /* a2 = a2 >> 2 = APB_CLK/4 */
    multu	a1, a2 /* hilo = a1 * a2 = PLL_MULT * APB_CLK /4 */
    mflo	v1 /* v1 = lo. put low 32 bit of a1 * a2 to v1 as PLL freq */
    /* Determine if we need to divide the clock */
    lw		a1, 4(a0) /* load value from CLK_DIV_PARAM to a1 */
    andi	a2, a1, DIV_CPU_SEL /* a2 = a1 & DIV_CPU_SEL, if CPU_SEL=1, divide the clock,
                            * if CPU_SEL=0, bypass the clock */
    bnez	a2, 1f /* divider selected: go check whether it is enabled */
    nop
    li		v1, APB_CLK /* bypass: v1 = APB_CLK */
    b		3f
    nop
1: /* Determine if the CPU_DIV is valid*/
    andi	a2, a1, DIV_CPU_EN /* a2 = a1 & DIV_CPU_EN */
    bnez	a2, 2f /* divider enabled: use the programmed CPU_DIV */
    nop
    srl		v1, 1 /* v1 >>= 1, so v1 = APB_CLK/4 * PLL_MULT/2 */
    b		3f
    nop
2: /* calculate CPU freq */
    andi	a1, DIV_CPU /* a1 &= DIV_CPU */
    srl		a1, DIV_CPU_SHIFT /* a1 >>= DIV_CPU_SHIFT */
    divu	v1, a1 /* v1 / a1, quotient is read back below */
    mflo	v1 /* v1 = lo, CPU Freq */
3:
    li		a1, (16 * EARLY_DEBUG_BAUD) /* a1 = 16 * BIT RATE */
    divu	v1, v1, a1 /* v1 = v1 / a1 */
    srl     v1, 1  /* v1 >>= 1 */
    sb	v1, LS1C_UART_LSB_OFFSET(v0) /* write the low 8 bits of the divisor into LSB */
    srl	v1, 8 /* v1 >>= 8 */
    sb	v1, LS1C_UART_MSB_OFFSET(v0) /* write the next 8 bits of the divisor into MSB */

    li	v1, CFCR_8BITS /* 8n1, no check */
    sb	v1, LS1C_UART_LCR_OFFSET(v0) /* write to LCR (Line Control Register) */

#ifdef EARLY_DEBUG_UART_FLOW_CTRL
    li	v1, MCR_DTR|MCR_RTS /* valid DTR and RTS */
    sb	v1, LS1C_UART_MCR_OFFSET(v0) /* write to MCR (MODEM Control Register) */
#endif

    li	v1, 0x0 /* disable all the interrupts */
    sb	v1, LS1C_UART_IER_OFFSET(v0) /* write to IER (Interrupt Enable Register) */

    /* "\r" followed directly by the text: "\I" is not a valid escape in gas */
    PRINTSTR("\rINFO: Loongson 1C300 Starting :) \r\n")

  /*
   * Disable all GPIOs so they do not conflict with other pin functions.
   * Writes zero to the four words at 0xbfd010c0..0xbfd010cc and all-ones
   * to the four words at 0xbfd010d0.. and at 0xbfd010f0..
   * NOTE(review): the 0x10d0 and 0x10f0 banks are presumably the GPIO
   * direction / output registers - confirm against the LS1C manual.
   * Clobbers: a0, t0 (plus the PRINTSTR clobbers).
   */
  li a0,0xbfd00000
  sw zero,0x10c0(a0)	/* disable GPIO 0-31 */
  sw zero,0x10c4(a0)	/* disable GPIO 32-63 */
  sw zero,0x10c8(a0)	/* disable GPIO 64-95 */
  sw zero,0x10cc(a0)	/* fourth word of the same bank */

  li t0, 0xffffffff
  sw t0, 0x10d0(a0)
  sw t0, 0x10d4(a0)
  sw t0, 0x10d8(a0)
  sw t0, 0x10dc(a0)

  sw t0, 0x10f0(a0)
  sw t0, 0x10f4(a0)
  sw t0, 0x10f8(a0)
  sw t0, 0x10fc(a0)

  /* "\r" followed directly by the text: "\I" is not a valid escape in gas */
  PRINTSTR("\rINFO: All GPIOs are disabled\r\n")

  /*
   * SDRAM initialization starts here.
   * Records MEM_SIZE in msize, optionally switches the EJTAG_SEL pin to
   * SDRAM_CS1, then writes the SD_CONFIG register pair three times.
   * Clobbers: t1, a0, a1 (plus the PRINTSTR clobbers).
   * The PRINTSTR strings below start with "\r" followed directly by the
   * text, because "\I" is not a valid escape sequence in gas.
   */

  li msize, MEM_SIZE

#ifdef EJTAG_SEL_AS_SDRAM_CS1
  li		a0, 0xbfd011c0
  lw		a1, 0x40(a0) /* read-modify-write the word at a0 + 0x40 */
  ori	a1, 0x01 /* set bit 0 */
  sw		a1, 0x40(a0)
  PRINTSTR("\rINFO: EJTAG_SEL PIN as SDRAM_CS1\r\n")
#endif

/*
* As recommended by the user manual, write SD_CONFIG[31:0] first, then
* SD_CONFIG[63:32]. Repeat the write three times and make the configuration
* valid on the last pass.
*/

/* write first time */
li  	t1, 0xbfd00410 /* load SD_CONFIG[31:0] address to t1 */
li		a1, SD_PARA0 /* get the memory config from macro SD_PARA0 */
sw		a1, 0x0(t1) /* write to SD_CONFIG[31:0] */
li		a1, SD_PARA1
sw		a1, 0x4(t1) /* write to SD_CONFIG[63:32] with offset */
PRINTSTR("\rINFO: SDRAM Config Pass1\r\n")

/* write second time, the same */
li		a1, SD_PARA0
sw		a1, 0x0(t1)
li		a1, SD_PARA1
sw		a1, 0x4(t1)
PRINTSTR("\rINFO: SDRAM Config Pass2\r\n")

/* write third time, enable controller this time */
li		a1, SD_PARA0
sw		a1, 0x0(t1)
li		a1, SD_PARA1_EN /* enable it */
sw		a1, 0x4(t1)
PRINTSTR("\rINFO: SDRAM initialize compeleted\r\n")


/* initialize cache: cache_init saves ra in t1 and clobbers a0-a3, v0, v1, t3-t7 */
bal     cache_init /* branch to cache_init */
nop

/* enable cache */
mfc0   a0, CP0_CONFIG /* load cp0 config to a0 */
and    a0, a0, ~((1<<12) | 7) /* clear bit 12 and the low 3-bit field */
or     a0, a0, 2 /* set the low 3-bit field to 2 */
               /* NOTE(review): confirm against the LS1C manual that value 2
                * is the intended kseg0 cache attribute - TODO confirm */
mtc0   a0, CP0_CONFIG /* write back to CP0 config */

/*
* relocate: copy the copy routine (text_copy_start..text_copy_end) to its
* link address in kseg0 memory, then jump there. Running the main image
* copy from kseg0 speeds the copy up.
* Clobbers: t0, t1, t2, v0 (plus the PRINTSTR clobbers).
*/

/* "\r" followed directly by the text: "\I" is not a valid escape in gas */
PRINTSTR("\rINFO: Relocating")
la		t0, text_copy_start /* t0 = link address of the copy routine */
move		t2, t0 /* t2 = destination cursor (link address) */
addu		t0, s0 /* t0 = source cursor: link address corrected by s0 to the ROM location */
la		t1, text_copy_end /* t1 = link address of the end of the copy routine */

selfboot_copy_loop:
lw		v0, (t0) /* read one word from the source address in t0 */
sw		v0, (t2) /* write it to the destination address in t2 */
addiu		t0, 0x4 /* t0 moves forward 4 bytes */
addiu		t2, 0x4 /* t2 moves forward 4 bytes */
ble		t2, t1, selfboot_copy_loop /* continue while t2 <= t1, so the word at
                                   * text_copy_end itself is copied as well */
nop

la		t0, text_copy_start /* load the link address of the copied routine */
jr		t0 /* continue execution in the copy at text_copy_start */
nop

text_copy_start:
/*
 * Image copy routine, entered through `jr` at its link address after the
 * relocation loop has copied it there.
 * Copies the image [start, _edata) from its location in flash (link
 * address + s0) to its link address in RAM, then calls _rtthread_entry
 * with a0 = memory size in MB.
 * Clobbers: a0, a1, a2, t0, t1, t2, t3, v0, ra.
 */
la		a0, start /* a0 = link (RAM) address of the image start */
addu		a1, a0, s0 /* a1 = the same location as seen in flash */
la		a2, _edata /* a2 = link (RAM) address of the image end */

move	t0, a0 /* t0 = destination cursor in RAM */
move	t1, a1 /* t1 = source cursor in flash */
move	t2, a2 /* t2 = destination end address in RAM (symbol _edata) */

/* copy the image one word at a time; at least one word is always copied */
1:
lw		t3, 0(t1) /* read 4 bytes from the flash address in t1 */
nop
sw		t3, 0(t0) /* write 4 bytes to the RAM address in t0 */
addu	t0, 4 /* t0 moves forward 4 bytes */
addu	t1, 4 /* t1 moves forward 4 bytes */
sltu	t3, t0, t2 /* t3 = (t0 < t2), unsigned: also terminates when the
                    * image size is not a multiple of 4 */
bnez	t3, 1b /* more words to copy */
nop
/* copy done. */

move	a0, msize  /* a0 = msize, will be passed to main */
srl	a0, 20 /* a0 >>= 20, convert to unit in MB */

/* execute main */
la		v0, _rtthread_entry /* load address of the entry function to v0 */
jalr	v0 /* call it: all low-level setup is done, leave assembly */
nop
text_copy_end: /* end of the routine that is copied to memory */

loop: /* only reached if _rtthread_entry returns: spin forever */
b	loop
nop

/* functions here */

/*
 * stringserial: print a NUL-terminated string through tgt_putchar.
 * In:       a0 = link-time address of the string
 *           s0 = offset added to a0 to reach the string in ROM
 * Clobbers: a0, a1, a2, v0, v1, ra
 */
LEAF(stringserial)
    nop
    move	a2, ra  /* save the return address to a2; bal below overwrites ra */
    addu	a1, a0, s0 /* correct the address in ROM */
    lbu	a0, 0(a1) /* read the first byte at address a1 into a0 */
1:
    beqz	a0, 2f /* NUL terminator reached: done */
    nop
    bal	tgt_putchar /* print the character in a0 */
    addiu	a1, 1 /* delay slot: advance a1 to the next byte */
    b	1b /* back to the terminator test */
    lbu	a0, 0(a1) /* delay slot: load the next byte from address a1 into a0;
                * executed before the branch takes effect */
2:
    j	a2 /* return */
    nop
END(stringserial)


/*
 * hexserial: print the 32-bit value in a0 as 8 lowercase hex digits, most
 * significant digit first, through tgt_putchar.
 * In:       a0 = value to print
 *           s0 = offset added to the link-time address of hexchar
 * Clobbers: a0, a1, a2, a3, v0, v1, ra
 */
LEAF(hexserial)
    nop
    move	a2, ra /* save the return address in a2; bal below overwrites ra */
    move	a1, a0 /* a1 = value still to be printed */
    li	a3, 7 /* loop counter: 7 down to 0 gives 8 iterations */
1:
    rol	a0, a1, 4 /* rotate left by 4: the top nibble moves to bits 3:0 */
    move	a1, a0 /* keep the rotated value for the next iteration */
    and	a0, 0xf /* a0 = current nibble */
    la	v0, hexchar /* v0 = link-time address of the digit table */

/* digit lookup table, emitted into the data section */
.pushsection .selfboot_data
.align	4
hexchar:
        .ascii	"0123456789abcdef"
.popsection
.align	4

    addu	v0, s0 /* relocate the table address by s0 */
    addu	v0, a0 /* index the table by the nibble */
    bal	tgt_putchar
    lbu	a0, 0(v0) /* delay slot: load the digit character into a0 */

    bnez	a3, 1b /* loop until the counter reaches zero */
    addu	a3, -1 /* delay slot: decrement the counter */

    j	a2 /* return to the saved address */
    nop
END(hexserial)

/*
 * tgt_putchar: send the byte in a0 over UART2, busy-waiting until the
 * LSR_TXRDY bit of the line status register is set.
 * In:       a0 = character to send
 * Clobbers: v0, v1
 */
LEAF(tgt_putchar)
    la	v0, LS1C_UART2_BASE /* v0 = UART2 register base address */
    lbu	v1, LS1C_UART_LSR_OFFSET(v0) /* v1 = LSR value */
1:
    and	v1, LSR_TXRDY /* v1 &= LSR_TXRDY: isolate the transmitter-ready bit */
    beqz	v1, 1b /* not ready yet: test again with the value loaded below */
    lbu	v1, LS1C_UART_LSR_OFFSET(v0) /* delay slot: re-read LSR into v1 */

    sb	a0, LS1C_UART_DAT_OFFSET(v0) /* write a0 into DAT, send out */
    j	ra /* return */
    nop
END(tgt_putchar)

/*
 * CPU_SetSR: read-modify-write of the CP0 Status register.
 * In:       a0 = bits to set, a1 = bits to clear
 * Out:      v0 = Status value read before the modification
 * Clobbers: v1
 * New Status = (old Status & ~a1) | a0.
 */
LEAF(CPU_SetSR)
    mfc0    v0, CP0_STATUS      /* v0 = current Status */
    nor     v1, a1, zero        /* v1 = ~a1 (mask of bits to keep) */
    and     v1, v1, v0          /* drop the bits requested in a1 */
    or      v1, v1, a0          /* add the bits requested in a0 */
    mtc0    v1, CP0_STATUS      /* install the new Status */
    /* eight nops before returning, giving the CP0 write time to take effect */
    .rept   8
    nop
    .endr
    jr      ra
    nop                         /* delay slot */
END(CPU_SetSR)

/*
 * cache_init: size the caches from CP0 Config (select 1), then walk the
 * index range starting at 0x80000000 in 0x20-byte steps to store zero
 * tags in the D-cache, invalidate the I-cache and write back/invalidate
 * the D-cache.
 * In:       ra = return address (kept in t1 for the whole routine)
 * Clobbers: t1, t3, t4, t5, t6, t7, v0, v1, a0, a1, a2, a3
 */
cache_init:
    move t1, ra /* save the return address */
####part 2####
cache_detect_4way:
    mfc0	t4, CP0_CONFIG,1 /* read CP0 Config, select 1, into t4 */
    lui		v0, 0x7 /* v0 = 0x7 << 16 */
    and		v0, t4, v0 /* v0 = t4 & v0 */
    srl		t3, v0, 16 /* t3 = v0 >> 16, I-cache associativity field (IA) */

    li		t5, 0x800 		//32*64
    srl		v1, t4,22		//v1 = t4 >> 22
    andi	v1, 7			//I-cache sets per way field, 64 x 2^S (IS)
    sll		t5, v1			//InstCacheSetSize
    sll		t5, t3			//t5 InstCacheSize


    andi	v0, t4, 0x0380
    srl		t7, v0, 7		//DA

    li		t6, 0x800       // 32*64
    srl		v1, t4,13
    andi	v1, 7			//DS
    sll		t6, v1          // DataCacheSetSize
    sll		t6, t7          // t6 DataCacheSize

####part 3####
    lui		a0, 0x8000			//a0 = 0x8000 << 16
    addu	a1, $0, t5 /* a1 = I-cache size computed above */
    addu	a2, $0, t6 /* a2 = D-cache size computed above */
cache_init_d2way:
/* a0=0x80000000, a1=icache_size, a2=dcache_size */
/* a3, v0 and v1 used as local registers */
    mtc0	$0, CP0_TAGHI
    addu	v0, $0, a0 /* v0 = 0 + a0 */
    addu	v1, a0, a2 /* v1 = a0 + a2 */
1:	slt		a3, v0, v1 /* a3 = v0 < v1 ? 1 : 0 */
    beq		a3, $0, 1f /* if (a3 == 0) goto 1f */
    nop
    mtc0	$0, CP0_TAGLO
    cache	Index_Store_Tag_D, 0x0(v0) /* 1 way */
4:	beq		$0, $0, 1b /* unconditional branch back to the range test */
    addiu	v0, v0, 0x20 /* delay slot: advance by 0x20 bytes */
1:
cache_flush_i2way:
    addu	v0, $0, a0 /* v0 = start of the index range */
    addu	v1, a0, a1 /* v1 = start + I-cache size */
1:
    slt		a3, v0, v1
    beq		a3, $0, 1f /* range done */
    nop
    cache	Index_Invalidate_I, 0x0(v0) /* 1 way */
4:
    beq		$0, $0, 1b /* unconditional branch back to the range test */
    addiu	v0, v0, 0x20 /* delay slot: advance by 0x20 bytes */
1:
cache_flush_d2way:
    addu	v0, $0, a0 /* v0 = start of the index range */
    addu	v1, a0, a2 /* v1 = start + D-cache size */
1:	slt		a3, v0, v1
    beq		a3, $0, 1f /* range done */
    nop
    cache	Index_Writeback_Inv_D, 0x0(v0) /* 1 way */
4:	beq		$0, $0, 1b /* unconditional branch back to the range test */
    addiu	v0, v0, 0x20 /* delay slot: advance by 0x20 bytes */

1:
cache_init_finish:
    jr	t1 /* return through the address saved on entry */
    nop
#endif