/* memset with AVX2
   Copyright (C) 2014-2016 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#if IS_IN (libc)

#include "asm-syntax.h"
#ifndef MEMSET
# define MEMSET __memset_avx2
# define MEMSET_CHK __memset_chk_avx2
#endif

	.section .text.avx2,"ax",@progbits
#if defined PIC
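/* void *__memset_chk_avx2 (void *dest, int c, size_t count, size_t destlen):
   call __chk_fail when DESTLEN < COUNT, otherwise fall through into
   memset below.  */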
ENTRY (MEMSET_CHK)
	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMSET_CHK)
#endif

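/* void *__memset_avx2 (void *dest, int c, size_t count)

   The byte in %esi is broadcast into %xmm0, %rsi is set to one past the
   end of the buffer, and %rax keeps DEST as the return value.  COUNT is
   then dispatched on:
     * COUNT < 16: small scalar/8-byte paths below L(less_16bytes);
     * 16 <= COUNT < 256: overlapping unaligned 16-byte stores issued
       from both the start and the end of the buffer;
     * COUNT >= 256: a 32-byte-aligned loop writing 128 bytes per
       iteration, or rep stosb once more than 4096 bytes remain.  */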
ENTRY (MEMSET)
	vpxor	%xmm0, %xmm0, %xmm0
	vmovd	%esi, %xmm1
	lea	(%rdi, %rdx), %rsi
	mov	%rdi, %rax
	vpshufb	%xmm0, %xmm1, %xmm0
	cmp	$16, %rdx
	jb	L(less_16bytes)
	cmp	$256, %rdx
	jae	L(256bytesormore)
	cmp	$128, %dl
	jb	L(less_128bytes)
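	/* 128 <= COUNT < 256: eight 16-byte stores from the start plus
	   eight backwards from the end; the two ranges overlap, so every
	   byte is written at least once.  */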
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm0, 0x10(%rdi)
	vmovdqu	%xmm0, 0x20(%rdi)
	vmovdqu	%xmm0, 0x30(%rdi)
	vmovdqu	%xmm0, 0x40(%rdi)
	vmovdqu	%xmm0, 0x50(%rdi)
	vmovdqu	%xmm0, 0x60(%rdi)
	vmovdqu	%xmm0, 0x70(%rdi)
	vmovdqu	%xmm0, -0x80(%rsi)
	vmovdqu	%xmm0, -0x70(%rsi)
	vmovdqu	%xmm0, -0x60(%rsi)
	vmovdqu	%xmm0, -0x50(%rsi)
	vmovdqu	%xmm0, -0x40(%rsi)
	vmovdqu	%xmm0, -0x30(%rsi)
	vmovdqu	%xmm0, -0x20(%rsi)
	vmovdqu	%xmm0, -0x10(%rsi)
	ret

	.p2align 4
L(less_128bytes):
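	/* COUNT is 16..127.  64 and above is handled with four 16-byte
	   stores from each end; smaller counts keep narrowing below.  */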
	cmp	$64, %dl
	jb	L(less_64bytes)
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm0, 0x10(%rdi)
	vmovdqu	%xmm0, 0x20(%rdi)
	vmovdqu	%xmm0, 0x30(%rdi)
	vmovdqu	%xmm0, -0x40(%rsi)
	vmovdqu	%xmm0, -0x30(%rsi)
	vmovdqu	%xmm0, -0x20(%rsi)
	vmovdqu	%xmm0, -0x10(%rsi)
	ret

	.p2align 4
L(less_64bytes):
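	/* COUNT is 16..63.  32 and above takes two 16-byte stores from
	   each end.  */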
	cmp	$32, %dl
	jb	L(less_32bytes)
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm0, 0x10(%rdi)
	vmovdqu	%xmm0, -0x20(%rsi)
	vmovdqu	%xmm0, -0x10(%rsi)
	ret

	.p2align 4
L(less_32bytes):
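	/* COUNT is 16..31: two overlapping 16-byte stores.  */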
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm0, -0x10(%rsi)
	ret

	.p2align 4
L(less_16bytes):
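	/* COUNT is 0..15.  8 and above takes two overlapping 8-byte
	   stores.  */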
	cmp	$8, %dl
	jb	L(less_8bytes)
	vmovq	%xmm0, (%rdi)
	vmovq	%xmm0, -0x08(%rsi)
	ret

	.p2align 4
L(less_8bytes):
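	/* COUNT is 0..7: move the 4-byte fill pattern to %ecx and finish
	   with scalar stores.  */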
	vmovd	%xmm0, %ecx
	cmp	$4, %dl
	jb	L(less_4bytes)
	mov	%ecx, (%rdi)
	mov	%ecx, -0x04(%rsi)
	ret

	.p2align 4
L(less_4bytes):
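	/* COUNT is 0..3: 2 and above takes two overlapping 2-byte
	   stores.  */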
	cmp	$2, %dl
	jb	L(less_2bytes)
	mov	%cx, (%rdi)
	mov	%cx, -0x02(%rsi)
	ret

	.p2align 4
L(less_2bytes):
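	/* COUNT is 0..1: store a single byte when COUNT is 1.  */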
	cmp	$1, %dl
	jb	L(less_1bytes)
	mov	%cl, (%rdi)
L(less_1bytes):
	ret

	.p2align 4
L(256bytesormore):
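	/* COUNT >= 256: widen the pattern to %ymm0, store the first 32
	   bytes unaligned, round %rdi up to the next 32-byte boundary and
	   leave in %rcx the bytes still to be written from there, minus a
	   128-byte tail that is always stored unaligned from the end.  */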
	vinserti128 $1, %xmm0, %ymm0, %ymm0
	and	$-0x20, %rdi
	add	$0x20, %rdi
	vmovdqu	%ymm0, (%rax)
	sub	%rdi, %rax
	lea	-0x80(%rax, %rdx), %rcx
	cmp	$4096, %rcx
	ja	L(gobble_data)
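	/* At most 4096 + 128 bytes remain: fill them with aligned 128-byte
	   iterations.  %ecx is wide enough for the residue here, and using
	   sub/add of -0x80 keeps both immediates in a single byte.  */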
L(gobble_128_loop):
	vmovdqa	%ymm0, (%rdi)
	vmovdqa	%ymm0, 0x20(%rdi)
	vmovdqa	%ymm0, 0x40(%rdi)
	vmovdqa	%ymm0, 0x60(%rdi)
	sub	$-0x80, %rdi
	add	$-0x80, %ecx
	jb	L(gobble_128_loop)
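	/* Store the last 128 bytes unaligned from the end and recover the
	   return value (DEST = end - COUNT).  */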
	mov	%rsi, %rax
	vmovdqu	%ymm0, -0x80(%rsi)
	vmovdqu	%ymm0, -0x60(%rsi)
	vmovdqu	%ymm0, -0x40(%rsi)
	vmovdqu	%ymm0, -0x20(%rsi)
	sub	%rdx, %rax
	vzeroupper
	ret

	.p2align 4
L(gobble_data):
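	/* More than 4096 bytes remain past the aligned start: add the
	   128-byte tail reserve back into %rcx, put the fill byte in %al
	   and let rep stosb finish the job.  %rdi already points at the
	   32-byte-aligned start, and the first 32 bytes were stored
	   above.  */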
	sub	$-0x80, %rcx
	vmovd	%xmm0, %eax
	rep	stosb
	mov	%rsi, %rax
	sub	%rdx, %rax
	vzeroupper
	ret

END (MEMSET)
#endif