#if !__ARMEB__ && !__thumb__

/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


/*
 * Optimized memcpy() for ARM.
 *
 * Note that memcpy() always returns the destination pointer,
 * so we have to preserve R0.
 */

/*
 * This file has been modified from the original for use in musl libc.
 * The main changes are: addition of .type memcpy,%function to make the
 * code safely callable from thumb mode, adjusting the return
 * instructions to be compatible with pre-thumb ARM cpus, and removal
 * of prefetch code that is not compatible with older cpus.
 */

.syntax unified

.global memcpy
.type memcpy,%function
memcpy:
	/* The stack must always be 64-bit aligned to be compliant with the
	 * ARM ABI. Since we have to save R0, we might as well save R4,
	 * which we can use for better pipelining of the reads below.
	 */
	.fnstart
	.save {r0, r4, lr}
	stmfd sp!, {r0, r4, lr}
	/* Make room for r5-r11, which will be spilled later. */
	.pad #28
	sub sp, sp, #28
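	/* The frame now holds 28 bytes of scratch at [sp, sp+28) for
	 * spilling r5-r11 with stmea, followed by the saved r0, r4, lr
	 * at [sp+28, sp+40).
	 */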

	/* it simplifies things to take care of len<4 early */
	cmp r2, #4
	blo copy_last_3_and_return

	/* compute the offset to align the source
	 * offset = (4-(src&3))&3 = -src & 3
	 */
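	/* e.g. src & 3 == 1 needs 3 bytes to reach a word boundary,
	 * src & 3 == 3 needs 1 byte, and src & 3 == 0 needs none;
	 * (-src) & 3 yields exactly that count.
	 */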
	rsb r3, r1, #0
	ands r3, r3, #3
	beq src_aligned

	/* align source to 32 bits. We need to insert 2 instructions between
	 * a ldr[b|h] and str[b|h] because byte and half-word instructions
	 * stall 2 cycles.
	 */
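	/* The lsl #31 below moves bit 0 of r3 into N and bit 1 into C,
	 * so the mi instructions copy one byte and the cs pairs copy
	 * two: together they handle the 1 to 3 alignment bytes.
	 */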
	movs r12, r3, lsl #31
	sub r2, r2, r3 /* we know that r3 <= r2 because r2 >= 4 */
	ldrbmi r3, [r1], #1
	ldrbcs r4, [r1], #1
	ldrbcs r12,[r1], #1
	strbmi r3, [r0], #1
	strbcs r4, [r0], #1
	strbcs r12,[r0], #1

src_aligned:

	/* see if src and dst are aligned together (congruent) */
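	/* Congruent means (dst ^ src) & 3 == 0: once the source is
	 * word-aligned, the destination can be word-aligned too, so
	 * whole words can be copied without any shifting.
	 */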
	eor r12, r0, r1
	tst r12, #3
	bne non_congruent

	/* Use post-increment mode for stm to spill r5-r11 to reserved stack
	 * frame. Don't update sp.
	 */
	stmea sp, {r5-r11}

	/* align the destination to a cache-line */
	rsb r3, r0, #0
	ands r3, r3, #0x1C
	beq congruent_aligned32
	cmp r3, r2
	andhi r3, r2, #0x1C

	/* conditionally copies 0 to 7 words (length in r3) */
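	/* The lsl #28 below moves bit 4 of r3 into C (a 16-byte chunk)
	 * and bit 3 into N (an 8-byte chunk); the tst #4 then picks up
	 * the remaining word, covering any multiple of 4 up to 28 bytes.
	 */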
	movs r12, r3, lsl #28
	ldmcs r1!, {r4, r5, r6, r7} /* 16 bytes */
	ldmmi r1!, {r8, r9} /* 8 bytes */
	stmcs r0!, {r4, r5, r6, r7}
	stmmi r0!, {r8, r9}
	tst r3, #0x4
	ldrne r10,[r1], #4 /* 4 bytes */
	strne r10,[r0], #4
	sub r2, r2, r3

congruent_aligned32:
	/*
	 * here destination is aligned to 32 bytes.
	 */

cached_aligned32:
	subs r2, r2, #32
	blo less_than_32_left

	/*
	 * We preload a cache-line up to 64 bytes ahead. On the 926, this will
	 * stall only until the requested word is fetched, but the linefill
	 * continues in the background.
	 * While the linefill is going, we write our previous cache-line
	 * into the write-buffer (which should have some free space).
	 * When the linefill is done, the writebuffer will
	 * start dumping its content into memory.
	 *
	 * While all this is going on, we then load a full cache line into
	 * 8 registers; this cache line should be in the cache by now
	 * (or partly in the cache).
	 *
	 * This code should work well regardless of the source/dest alignment.
	 *
	 */
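	/* In effect the main loop below is (C sketch, for illustration):
	 *   while ((len -= 32) >= 0) { copy 32 bytes via r4-r11; }
	 *   len += 32;
	 */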

	/* Align the preload register to a cache-line because the cpu does
	 * "critical word first" (the first word requested is loaded first).
	 */
	@ bic r12, r1, #0x1F
	@ add r12, r12, #64

1:	ldmia r1!, { r4-r11 }
	subs r2, r2, #32

	/*
	 * NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
	 * for ARM9 preload will not be safely guarded by the preceding subs.
	 * When it is safely guarded the only possibility to have SIGSEGV here
	 * is because the caller overstates the length.
	 */
	@ ldrhi r3, [r12], #32 /* cheap ARM9 preload */
	stmia r0!, { r4-r11 }
	bhs 1b

	add r2, r2, #32

less_than_32_left:
	/*
	 * less than 32 bytes left at this point (length in r2)
	 */

	/* skip all this if there is nothing to do, which should
	 * be a common case (if not executed the code below takes
	 * about 16 cycles)
	 */
	tst r2, #0x1F
	beq 1f

	/* conditionally copies 0 to 31 bytes */
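	/* Same flag trick as above: lsl #28 exposes the 16- and 8-byte
	 * chunks in C and N, lsl #30 then exposes the 4-byte chunk in C
	 * and the 2-byte chunk in N, and the final tst picks up the odd
	 * byte.
	 */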
	movs r12, r2, lsl #28
	ldmcs r1!, {r4, r5, r6, r7} /* 16 bytes */
	ldmmi r1!, {r8, r9} /* 8 bytes */
	stmcs r0!, {r4, r5, r6, r7}
	stmmi r0!, {r8, r9}
	movs r12, r2, lsl #30
	ldrcs r3, [r1], #4 /* 4 bytes */
	ldrhmi r4, [r1], #2 /* 2 bytes */
	strcs r3, [r0], #4
	strhmi r4, [r0], #2
	tst r2, #0x1
	ldrbne r3, [r1] /* last byte */
	strbne r3, [r0]

	/* we're done! restore everything and return */
1:	ldmfd sp!, {r5-r11}
	ldmfd sp!, {r0, r4, lr}
	bx lr

/********************************************************************/

non_congruent:
	/*
	 * here source is aligned to 4 bytes
	 * but destination is not.
	 *
	 * in the code below r2 is the number of bytes read
	 * (the number of bytes written is always smaller, because we have
	 * partial words in the shift queue)
	 */
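	/* Illustrative C model of the merge performed below (assumes a
	 * little-endian cpu), with right = 8*(-dst & 3) kept in r12 and
	 * left = 32 - right kept in lr:
	 *   carry = first_word >> right;    // after the partial store
	 *   while (len >= 4) {
	 *       next = *src32++; len -= 4;
	 *       *dst32++ = carry | (next << left);
	 *       carry = next >> right;
	 *   }
	 * i.e. each output word splices the high bytes of the previous
	 * source word with the low bytes of the next one.
	 */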
	cmp r2, #4
	blo copy_last_3_and_return

	/* Use post-increment mode for stm to spill r5-r11 to reserved stack
	 * frame. Don't update sp.
	 */
	stmea sp, {r5-r11}

	/* compute shifts needed to align src to dest */
	rsb r5, r0, #0
	and r5, r5, #3 /* r5 = # bytes in partial words */
	mov r12, r5, lsl #3 /* r12 = right */
	rsb lr, r12, #32 /* lr = left */

	/* read the first word */
	ldr r3, [r1], #4
	sub r2, r2, #4

	/* write a partial word (0 to 3 bytes), such that destination
	 * becomes aligned to 32 bits (r5 = nb of bytes to copy for alignment)
	 */
	movs r5, r5, lsl #31
	strbmi r3, [r0], #1
	movmi r3, r3, lsr #8
	strbcs r3, [r0], #1
	movcs r3, r3, lsr #8
	strbcs r3, [r0], #1
	movcs r3, r3, lsr #8

	cmp r2, #4
	blo partial_word_tail

	/* Align destination to 32 bytes (cache line boundary) */
1:	tst r0, #0x1c
	beq 2f
	ldr r5, [r1], #4
	sub r2, r2, #4
	orr r4, r3, r5, lsl lr
	mov r3, r5, lsr r12
	str r4, [r0], #4
	cmp r2, #4
	bhs 1b
	blo partial_word_tail

	/* copy 32 bytes at a time */
2:	subs r2, r2, #32
	blo less_than_thirtytwo

	/* Use immediate mode for the shifts, because there is an extra cycle
	 * for register shifts, which could account for up to a 50%
	 * performance hit.
	 */
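	/* Three unrolled variants follow: loop8, loop16 and loop24
	 * handle right-shift amounts (r12) of 8, 16 and 24 bits
	 * respectively, so every shift can use an immediate operand.
	 */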

	cmp r12, #24
	beq loop24
	cmp r12, #8
	beq loop8

loop16:
	ldr r12, [r1], #4
1:	mov r4, r12
	ldmia r1!, { r5,r6,r7, r8,r9,r10,r11}
	subs r2, r2, #32
	ldrhs r12, [r1], #4
	orr r3, r3, r4, lsl #16
	mov r4, r4, lsr #16
	orr r4, r4, r5, lsl #16
	mov r5, r5, lsr #16
	orr r5, r5, r6, lsl #16
	mov r6, r6, lsr #16
	orr r6, r6, r7, lsl #16
	mov r7, r7, lsr #16
	orr r7, r7, r8, lsl #16
	mov r8, r8, lsr #16
	orr r8, r8, r9, lsl #16
	mov r9, r9, lsr #16
	orr r9, r9, r10, lsl #16
	mov r10, r10, lsr #16
	orr r10, r10, r11, lsl #16
	stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
	mov r3, r11, lsr #16
	bhs 1b
	b less_than_thirtytwo

loop8:
	ldr r12, [r1], #4
1:	mov r4, r12
	ldmia r1!, { r5,r6,r7, r8,r9,r10,r11}
	subs r2, r2, #32
	ldrhs r12, [r1], #4
	orr r3, r3, r4, lsl #24
	mov r4, r4, lsr #8
	orr r4, r4, r5, lsl #24
	mov r5, r5, lsr #8
	orr r5, r5, r6, lsl #24
	mov r6, r6, lsr #8
	orr r6, r6, r7, lsl #24
	mov r7, r7, lsr #8
	orr r7, r7, r8, lsl #24
	mov r8, r8, lsr #8
	orr r8, r8, r9, lsl #24
	mov r9, r9, lsr #8
	orr r9, r9, r10, lsl #24
	mov r10, r10, lsr #8
	orr r10, r10, r11, lsl #24
	stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
	mov r3, r11, lsr #8
	bhs 1b
	b less_than_thirtytwo

loop24:
	ldr r12, [r1], #4
1:	mov r4, r12
	ldmia r1!, { r5,r6,r7, r8,r9,r10,r11}
	subs r2, r2, #32
	ldrhs r12, [r1], #4
	orr r3, r3, r4, lsl #8
	mov r4, r4, lsr #24
	orr r4, r4, r5, lsl #8
	mov r5, r5, lsr #24
	orr r5, r5, r6, lsl #8
	mov r6, r6, lsr #24
	orr r6, r6, r7, lsl #8
	mov r7, r7, lsr #24
	orr r7, r7, r8, lsl #8
	mov r8, r8, lsr #24
	orr r8, r8, r9, lsl #8
	mov r9, r9, lsr #24
	orr r9, r9, r10, lsl #8
	mov r10, r10, lsr #24
	orr r10, r10, r11, lsl #8
	stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
	mov r3, r11, lsr #24
	bhs 1b

less_than_thirtytwo:
	/* copy the last 0 to 31 bytes of the source */
	rsb r12, lr, #32 /* we corrupted r12, recompute it */
	add r2, r2, #32
	cmp r2, #4
	blo partial_word_tail

1:	ldr r5, [r1], #4
	sub r2, r2, #4
	orr r4, r3, r5, lsl lr
	mov r3, r5, lsr r12
	str r4, [r0], #4
	cmp r2, #4
	bhs 1b

partial_word_tail:
	/* we have a partial word in the input buffer */
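	/* r3 still holds lr/8 unwritten bytes (1, 2 or 3). The lsl #28
	 * below moves bit 3 of lr into N and bit 4 into C, so lr=8
	 * writes one byte (mi), lr=16 two (cs), and lr=24 all three.
	 */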
	movs r5, lr, lsl #(31-3)
	strbmi r3, [r0], #1
	movmi r3, r3, lsr #8
	strbcs r3, [r0], #1
	movcs r3, r3, lsr #8
	strbcs r3, [r0], #1

	/* Refill spilled registers from the stack. Don't update sp. */
	ldmfd sp, {r5-r11}

copy_last_3_and_return:
	movs r2, r2, lsl #31 /* copy remaining 0, 1, 2 or 3 bytes */
	ldrbmi r2, [r1], #1
	ldrbcs r3, [r1], #1
	ldrbcs r12,[r1]
	strbmi r2, [r0], #1
	strbcs r3, [r0], #1
	strbcs r12,[r0]

	/* we're done! restore sp and spilled registers and return */
	add sp, sp, #28
	ldmfd sp!, {r0, r4, lr}
	bx lr

#endif