From 5e9377ec6f84e5334e9347e84e77d34e9a089ca7 Mon Sep 17 00:00:00 2001 From: Stuart Menefy Date: Mon, 24 Aug 2009 17:35:07 +0900 Subject: [PATCH] sh: Optimise memcpy_to/fromio for SH4 Optimise memcpy_to/fromio. This is used extensivly by MTD, so is a worthwhile performance gain. The main savings come from not repeatedly calling readl/writel, and doing word instead of byte at a time transfers. Also using "movca.l" on SH4 gives a small performance win. Signed-off-by: Stuart Menefy Signed-off-by: Paul Mundt --- arch/sh/kernel/io.c | 93 +++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 72 insertions(+), 21 deletions(-) diff --git a/arch/sh/kernel/io.c b/arch/sh/kernel/io.c index d0ca968..4770c24 100644 --- a/arch/sh/kernel/io.c +++ b/arch/sh/kernel/io.c @@ -1,12 +1,9 @@ /* - * linux/arch/sh/kernel/io.c + * arch/sh/kernel/io.c - Machine independent I/O functions. * - * Copyright (C) 2000 Stuart Menefy + * Copyright (C) 2000 - 2009 Stuart Menefy * Copyright (C) 2005 Paul Mundt * - * Provide real functions which expand to whatever the header file defined. - * Also definitions of machine independent IO functions. - * * This file is subject to the terms and conditions of the GNU General Public * License. See the file "COPYING" in the main directory of this archive * for more details. @@ -18,33 +15,87 @@ /* * Copy data from IO memory space to "real" memory space. - * This needs to be optimized. */ void memcpy_fromio(void *to, const volatile void __iomem *from, unsigned long count) { - unsigned char *p = to; - while (count) { - count--; - *p = readb(from); - p++; - from++; - } + /* + * Would it be worthwhile doing byte and long transfers first + * to try and get aligned? + */ +#ifdef CONFIG_CPU_SH4 + if ((count >= 0x20) && + (((u32)to & 0x1f) == 0) && (((u32)from & 0x3) == 0)) { + int tmp2, tmp3, tmp4, tmp5, tmp6; + + __asm__ __volatile__( + "1: \n\t" + "mov.l @%7+, r0 \n\t" + "mov.l @%7+, %2 \n\t" + "movca.l r0, @%0 \n\t" + "mov.l @%7+, %3 \n\t" + "mov.l @%7+, %4 \n\t" + "mov.l @%7+, %5 \n\t" + "mov.l @%7+, %6 \n\t" + "mov.l @%7+, r7 \n\t" + "mov.l @%7+, r0 \n\t" + "mov.l %2, @(0x04,%0) \n\t" + "mov #0x20, %2 \n\t" + "mov.l %3, @(0x08,%0) \n\t" + "sub %2, %1 \n\t" + "mov.l %4, @(0x0c,%0) \n\t" + "cmp/hi %1, %2 ! T if 32 > count \n\t" + "mov.l %5, @(0x10,%0) \n\t" + "mov.l %6, @(0x14,%0) \n\t" + "mov.l r7, @(0x18,%0) \n\t" + "mov.l r0, @(0x1c,%0) \n\t" + "bf.s 1b \n\t" + " add #0x20, %0 \n\t" + : "=&r" (to), "=&r" (count), + "=&r" (tmp2), "=&r" (tmp3), "=&r" (tmp4), + "=&r" (tmp5), "=&r" (tmp6), "=&r" (from) + : "7"(from), "0" (to), "1" (count) + : "r0", "r7", "t", "memory"); + } +#endif + + if ((((u32)to | (u32)from) & 0x3) == 0) { + for (; count > 3; count -= 4) { + *(u32 *)to = *(volatile u32 *)from; + to += 4; + from += 4; + } + } + + for (; count > 0; count--) { + *(u8 *)to = *(volatile u8 *)from; + to++; + from++; + } + + mb(); } EXPORT_SYMBOL(memcpy_fromio); /* * Copy data from "real" memory space to IO memory space. - * This needs to be optimized. */ void memcpy_toio(volatile void __iomem *to, const void *from, unsigned long count) { - const unsigned char *p = from; - while (count) { - count--; - writeb(*p, to); - p++; - to++; - } + if ((((u32)to | (u32)from) & 0x3) == 0) { + for ( ; count > 3; count -= 4) { + *(volatile u32 *)to = *(u32 *)from; + to += 4; + from += 4; + } + } + + for (; count > 0; count--) { + *(volatile u8 *)to = *(u8 *)from; + to++; + from++; + } + + mb(); } EXPORT_SYMBOL(memcpy_toio); -- 2.7.4