mirror of
				https://source.denx.de/u-boot/u-boot.git
				synced 2025-10-25 06:21:47 +02:00 
			
		
		
		
	At present U-Boot SPL fails to boot on the SiFive Unleashed board because a load-address-misaligned exception is raised while loading the FIT image in spl_load_simple_fit(). The exception occurs in memmove(), which is called by fdt_splice_(). Commit 8f0dc4cfd106 introduces an assembly version of memmove, but it does not take misalignment into account (it checks whether the length is a multiple of the machine word size, but the pointers also need to be aligned). As a result it generates misaligned loads/stores in the majority of cases, which causes a significant performance regression on hardware that traps misaligned loads/stores and emulates them in firmware. The current behaviour of memcpy is that it checks whether the src and dest pointers are co-aligned (i.e. congruent modulo SZREG). If they are, it copies data word by word after first aligning the pointers to a word boundary. If src and dest are not co-aligned, however, a byte-wise copy is performed. This patch was taken from the Linux kernel patch [1], which has not been applied at the time of writing. It fixes memmove and optimises memcpy for the misaligned cases. It first aligns the destination pointer to a word boundary regardless of whether src and dest are co-aligned. If they are, a word-wise copy is performed. If they are not co-aligned, it loads two adjacent words from src and uses shifts to assemble a full machine word. Some additional assembly-level micro-optimisation is also performed so that more instructions can be compressed (e.g. prefer a0 to t6). With this patch, U-Boot boots again on the SiFive Unleashed board. [1] https://patchwork.kernel.org/project/linux-riscv/patch/20210216225555.4976-1-gary@garyguo.net/ Fixes: 8f0dc4cfd106 ("riscv: assembler versions of memcpy, memmove, memset") Signed-off-by: Bin Meng <bmeng.cn@gmail.com>
		
			
				
	
	
		
			160 lines
		
	
	
		
			3.3 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
			
		
		
	
	
			160 lines
		
	
	
		
			3.3 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
| /* SPDX-License-Identifier: GPL-2.0-only */
 | |
| /*
 | |
|  * Copyright (C) 2013 Regents of the University of California
 | |
|  */
 | |
| 
 | |
| #include <linux/linkage.h>
 | |
| #include <asm/asm.h>
 | |
| 
 | |
/*
 * void *memcpy(void *dst, const void *src, size_t n)
 *
 * In:   a0 = dst, a1 = src, a2 = n (bytes)
 * Out:  a0 = dst (the value passed in)
 * Uses: a1-a7 and t0-t6 as scratch; the stack is not touched.
 *
 * Strategy:
 *   1. n < 16: plain bytewise copy.
 *   2. Otherwise copy single bytes until dst is word-aligned, then
 *      a) src also word-aligned: 16-word unrolled loop, then a
 *         one-word-at-a-time loop;
 *      b) src not word-aligned: read aligned words from src and merge
 *         each adjacent pair with shifts, so every load and store is
 *         still naturally aligned.
 *   3. Finish the remaining (< SZREG) bytes bytewise.
 */
ENTRY(__memcpy)
WEAK(memcpy)
	/* Save for return value */
	mv	t6, a0

	/*
	 * Register allocation for code below:
	 * a0 - start of uncopied dst
	 * a1 - start of uncopied src
	 * t0 - end of uncopied dst
	 */
	add	t0, a0, a2

	/*
	 * Use bytewise copy if too small.
	 *
	 * This threshold must be at least 2*SZREG to ensure at least one
	 * wordwise copy is performed. It is chosen to be 16 because it will
	 * save at least 7 iterations of bytewise copy, which pays off the
	 * fixed overhead.
	 */
	li	a3, 16
	bltu	a2, a3, .Lbyte_copy_tail

	/*
	 * Bytewise copy first to align a0 to word boundary.
	 * a2 is reused as "a0 rounded up to the next multiple of SZREG".
	 */
	addi	a2, a0, SZREG-1
	andi	a2, a2, ~(SZREG-1)
	beq	a0, a2, .Ldst_aligned
.Lalign_dst_loop:
	lb	a5, 0(a1)
	addi	a1, a1, 1
	sb	a5, 0(a0)
	addi	a0, a0, 1
	bne	a0, a2, .Lalign_dst_loop
.Ldst_aligned:

	/*
	 * Now a0 is word-aligned. If a1 is also word aligned, we could perform
	 * aligned word-wise copy. Otherwise we need to perform misaligned
	 * word-wise copy. a3 keeps the byte offset of a1 within its word; the
	 * misaligned path needs it again when it hands over to the byte tail.
	 */
	andi	a3, a1, SZREG-1
	bnez	a3, .Lmisaligned_word_copy

	/*
	 * Unrolled wordwise copy, 16 words per iteration.
	 *
	 * Whether to enter the loop is decided from the number of bytes that
	 * are left, not by comparing a0 with the biased end pointer: if the
	 * buffer ends below address 16*SZREG-1 the bias below wraps t0 around
	 * to a huge unsigned value, and a pointer comparison would then enter
	 * the loop and run far past both buffers.
	 *
	 * a2 is dead and a3 is known to be zero on this path, so both can be
	 * used as scratch here (the loop overwrites them anyway).
	 */
	sub	a2, t0, a0		/* a2 = bytes still to copy */
	li	a3, 16*SZREG
	/* Bias t0 so that "a0 < t0" means "a full 16-word chunk is left" */
	addi	t0, t0, -(16*SZREG-1)
	bltu	a2, a3, .Lunrolled_done
.Lunrolled_loop:
	REG_L	a2,        0(a1)
	REG_L	a3,    SZREG(a1)
	REG_L	a4,  2*SZREG(a1)
	REG_L	a5,  3*SZREG(a1)
	REG_L	a6,  4*SZREG(a1)
	REG_L	a7,  5*SZREG(a1)
	REG_L	t1,  6*SZREG(a1)
	REG_L	t2,  7*SZREG(a1)
	REG_L	t3,  8*SZREG(a1)
	REG_L	t4,  9*SZREG(a1)
	REG_L	t5, 10*SZREG(a1)
	REG_S	a2,        0(a0)
	REG_S	a3,    SZREG(a0)
	REG_S	a4,  2*SZREG(a0)
	REG_S	a5,  3*SZREG(a0)
	REG_S	a6,  4*SZREG(a0)
	REG_S	a7,  5*SZREG(a0)
	REG_S	t1,  6*SZREG(a0)
	REG_S	t2,  7*SZREG(a0)
	REG_S	t3,  8*SZREG(a0)
	REG_S	t4,  9*SZREG(a0)
	REG_S	t5, 10*SZREG(a0)
	/* Only 11 scratch registers are used, so reuse a2-a6 for words 11-15 */
	REG_L	a2, 11*SZREG(a1)
	REG_L	a3, 12*SZREG(a1)
	REG_L	a4, 13*SZREG(a1)
	REG_L	a5, 14*SZREG(a1)
	REG_L	a6, 15*SZREG(a1)
	addi	a1, a1, 16*SZREG
	REG_S	a2, 11*SZREG(a0)
	REG_S	a3, 12*SZREG(a0)
	REG_S	a4, 13*SZREG(a0)
	REG_S	a5, 14*SZREG(a0)
	REG_S	a6, 15*SZREG(a0)
	addi	a0, a0, 16*SZREG
	bltu	a0, t0, .Lunrolled_loop
.Lunrolled_done:
	/*
	 * Post-loop increment by 16*SZREG-1 and pre-loop decrement by SZREG-1,
	 * folded into one add. Afterwards "a0 < t0" means "a full word is
	 * left". (An intermediate wrap of t0 above is undone by this add.)
	 */
	addi	t0, t0, 15*SZREG

	/* Wordwise copy */
	bgeu	a0, t0, .Lword_done
.Lword_loop:
	REG_L	a5, 0(a1)
	addi	a1, a1, SZREG
	REG_S	a5, 0(a0)
	addi	a0, a0, SZREG
	bltu	a0, t0, .Lword_loop
.Lword_done:
	/* Undo the bias: t0 is the true end of dst again */
	addi	t0, t0, SZREG-1

.Lbyte_copy_tail:
	/*
	 * Bytewise copy anything left.
	 * Entered with t0 = true end of dst from every path.
	 */
	beq	a0, t0, .Lcopy_done
.Lbyte_tail_loop:
	lb	a5, 0(a1)
	addi	a1, a1, 1
	sb	a5, 0(a0)
	addi	a0, a0, 1
	bne	a0, t0, .Lbyte_tail_loop
.Lcopy_done:

	mv	a0, t6
	ret

.Lmisaligned_word_copy:
	/*
	 * Misaligned word-wise copy.
	 * For misaligned copy we still perform word-wise copy, but we need to
	 * use the value fetched from the previous iteration and do some shifts.
	 * This is safe because we wouldn't access more words than necessary.
	 *
	 * On entry a3 = a1 & (SZREG-1), which is non-zero here.
	 */

	/*
	 * Calculate shifts:
	 * t3 = 8 * a3, the right shift applied to the word fetched earlier;
	 * t4 = -t3, the left shift applied to the word fetched next.
	 */
	slli	t3, a3, 3
	sub	t4, x0, t3 /* negate is okay as shift will only look at LSBs */

	/* Load the initial value and align a1 */
	andi	a1, a1, ~(SZREG-1)
	REG_L	a5, 0(a1)

	/* Bias t0 so that "a0 < t0" means "a full word of dst is left" */
	addi	t0, t0, -(SZREG-1)
	/* At least one iteration will be executed here, no check */
.Lmisaligned_loop:
	srl	a4, a5, t3		/* tail of the previously loaded word */
	REG_L	a5, SZREG(a1)		/* next aligned source word */
	addi	a1, a1, SZREG
	sll	a2, a5, t4		/* head of the newly loaded word */
	or	a2, a2, a4
	REG_S	a2, 0(a0)
	addi	a0, a0, SZREG
	bltu	a0, t0, .Lmisaligned_loop

	/*
	 * Update pointers to correct value: remove the bias from t0 and put
	 * the in-word offset back into a1 so that it again points at the first
	 * source byte that has not been copied.
	 */
	addi	t0, t0, SZREG-1
	add	a1, a1, a3

	j	.Lbyte_copy_tail
END(__memcpy)
 |