summaryrefslogtreecommitdiffstats
path: root/patches/source/glibc/glibc-2.15_avx2.diff
blob: 1b1b88be9f8b9a38cfeca2232a46a7e7842c2fb2 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
From: Ulrich Drepper <drepper@gmail.com>
Date: Thu, 26 Jan 2012 14:45:54 +0000 (-0500)
Subject: Really fix AVX tests
X-Git-Tag: glibc-2.16-tps~1052
X-Git-Url: http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff_plain;h=08cf777f9e7f6d826658a99c7d77a359f73a45bf

Really fix AVX tests

There is no problem with strcmp, it doesn't use the YMM registers.
The math routines might since gcc perhaps generates such code.
Introduce bit_YMM_USBALE and use it in the math routines.
---

--- sysdeps/x86_64/fpu/multiarch/e_atan2.c
+++ sysdeps/x86_64/fpu/multiarch/e_atan2.c
@@ -14,7 +14,7 @@ extern double __ieee754_atan2_fma4 (double, double);
 
 libm_ifunc (__ieee754_atan2,
 	    HAS_FMA4 ? __ieee754_atan2_fma4
-	    : (HAS_AVX ? __ieee754_atan2_avx : __ieee754_atan2_sse2));
+	    : (HAS_YMM_USABLE ? __ieee754_atan2_avx : __ieee754_atan2_sse2));
 strong_alias (__ieee754_atan2, __atan2_finite)
 
 # define __ieee754_atan2 __ieee754_atan2_sse2
--- sysdeps/x86_64/fpu/multiarch/e_exp.c
+++ sysdeps/x86_64/fpu/multiarch/e_exp.c
@@ -14,7 +14,7 @@ extern double __ieee754_exp_fma4 (double);
 
 libm_ifunc (__ieee754_exp,
 	    HAS_FMA4 ? __ieee754_exp_fma4
-	    : (HAS_AVX ? __ieee754_exp_avx : __ieee754_exp_sse2));
+	    : (HAS_YMM_USABLE ? __ieee754_exp_avx : __ieee754_exp_sse2));
 strong_alias (__ieee754_exp, __exp_finite)
 
 # define __ieee754_exp __ieee754_exp_sse2
--- sysdeps/x86_64/fpu/multiarch/e_log.c
+++ sysdeps/x86_64/fpu/multiarch/e_log.c
@@ -14,7 +14,7 @@ extern double __ieee754_log_fma4 (double);
 
 libm_ifunc (__ieee754_log,
 	    HAS_FMA4 ? __ieee754_log_fma4
-	    : (HAS_AVX ? __ieee754_log_avx
+	    : (HAS_YMM_USABLE ? __ieee754_log_avx
 	       : __ieee754_log_sse2));
 strong_alias (__ieee754_log, __log_finite)
 
--- sysdeps/x86_64/fpu/multiarch/s_atan.c
+++ sysdeps/x86_64/fpu/multiarch/s_atan.c
@@ -12,7 +12,8 @@ extern double __atan_fma4 (double);
 #  define __atan_fma4 ((void *) 0)
 # endif
 
-libm_ifunc (atan, HAS_FMA4 ? __atan_fma4 : HAS_AVX ? __atan_avx : __atan_sse2);
+libm_ifunc (atan, (HAS_FMA4 ? __atan_fma4 :
+		   HAS_YMM_USABLE ? __atan_avx : __atan_sse2));
 
 # define atan __atan_sse2
 #endif
--- sysdeps/x86_64/fpu/multiarch/s_sin.c
+++ sysdeps/x86_64/fpu/multiarch/s_sin.c
@@ -17,10 +17,12 @@ extern double __sin_fma4 (double);
 #  define __sin_fma4 ((void *) 0)
 # endif
 
-libm_ifunc (__cos, HAS_FMA4 ? __cos_fma4 : HAS_AVX ? __cos_avx : __cos_sse2);
+libm_ifunc (__cos, (HAS_FMA4 ? __cos_fma4 :
+		    HAS_YMM_USABLE ? __cos_avx : __cos_sse2));
 weak_alias (__cos, cos)
 
-libm_ifunc (__sin, HAS_FMA4 ? __sin_fma4 : HAS_AVX ? __sin_avx : __sin_sse2);
+libm_ifunc (__sin, (HAS_FMA4 ? __sin_fma4 :
+		    HAS_YMM_USABLE ? __sin_avx : __sin_sse2));
 weak_alias (__sin, sin)
 
 # define __cos __cos_sse2
--- sysdeps/x86_64/fpu/multiarch/s_tan.c
+++ sysdeps/x86_64/fpu/multiarch/s_tan.c
@@ -12,7 +12,8 @@ extern double __tan_fma4 (double);
 #  define __tan_fma4 ((void *) 0)
 # endif
 
-libm_ifunc (tan, HAS_FMA4 ? __tan_fma4 : HAS_AVX ? __tan_avx : __tan_sse2);
+libm_ifunc (tan, (HAS_FMA4 ? __tan_fma4 :
+		  HAS_YMM_USABLE ? __tan_avx : __tan_sse2));
 
 # define tan __tan_sse2
 #endif
--- sysdeps/x86_64/multiarch/init-arch.c
+++ sysdeps/x86_64/multiarch/init-arch.c
@@ -147,13 +147,13 @@ __init_cpu_features (void)
   if (__cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx & bit_AVX)
     {
       /* Reset the AVX bit in case OSXSAVE is disabled.  */
-      if ((__cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx & bit_OSXSAVE) == 0
-	  || ({ unsigned int xcrlow;
-	      unsigned int xcrhigh;
-	      asm ("xgetbv"
-		   : "=a" (xcrlow), "=d" (xcrhigh) : "c" (0));
-	      (xcrlow & 6) != 6; }))
-	__cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx &= ~bit_AVX;
+      if ((__cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx & bit_OSXSAVE) != 0
+	  && ({ unsigned int xcrlow;
+		unsigned int xcrhigh;
+		asm ("xgetbv"
+		     : "=a" (xcrlow), "=d" (xcrhigh) : "c" (0));
+		(xcrlow & 6) == 6; }))
+	__cpu_features.feature[index_YMM_Usable] |= bit_YMM_Usable;
     }
 
   __cpu_features.family = family;
--- sysdeps/x86_64/multiarch/init-arch.h
+++ sysdeps/x86_64/multiarch/init-arch.h
@@ -22,6 +22,7 @@
 #define bit_Prefer_SSE_for_memop	(1 << 3)
 #define bit_Fast_Unaligned_Load		(1 << 4)
 #define bit_Prefer_PMINUB_for_stringop	(1 << 5)
+#define bit_YMM_Usable			(1 << 6)
 
 #define bit_SSE2	(1 << 26)
 #define bit_SSSE3	(1 << 9)
@@ -49,6 +50,7 @@
 # define index_Prefer_SSE_for_memop	FEATURE_INDEX_1*FEATURE_SIZE
 # define index_Fast_Unaligned_Load	FEATURE_INDEX_1*FEATURE_SIZE
 # define index_Prefer_PMINUB_for_stringop FEATURE_INDEX_1*FEATURE_SIZE
+# define index_YMM_Usable		FEATURE_INDEX_1*FEATURE_SIZE
 
 #else	/* __ASSEMBLER__ */
 
@@ -93,7 +95,7 @@ extern struct cpu_features
 
 
 extern void __init_cpu_features (void) attribute_hidden;
-#define INIT_ARCH()\
+# define INIT_ARCH() \
   do							\
     if (__cpu_features.kind == arch_kind_unknown)	\
       __init_cpu_features ();				\
@@ -126,23 +128,21 @@ extern const struct cpu_features *__get_cpu_features (void)
 # define index_Slow_BSF			FEATURE_INDEX_1
 # define index_Prefer_SSE_for_memop	FEATURE_INDEX_1
 # define index_Fast_Unaligned_Load	FEATURE_INDEX_1
+# define index_YMM_Usable		FEATURE_INDEX_1
 
-#define HAS_ARCH_FEATURE(idx, bit) \
-  ((__get_cpu_features ()->feature[idx] & (bit)) != 0)
+# define HAS_ARCH_FEATURE(name) \
+  ((__get_cpu_features ()->feature[index_##name] & (bit_##name)) != 0)
 
-#define HAS_FAST_REP_STRING \
-  HAS_ARCH_FEATURE (index_Fast_Rep_String, bit_Fast_Rep_String)
+# define HAS_FAST_REP_STRING	HAS_ARCH_FEATURE (Fast_Rep_String)
 
-#define HAS_FAST_COPY_BACKWARD \
-  HAS_ARCH_FEATURE (index_Fast_Copy_Backward, bit_Fast_Copy_Backward)
+# define HAS_FAST_COPY_BACKWARD	HAS_ARCH_FEATURE (Fast_Copy_Backward)
 
-#define HAS_SLOW_BSF \
-  HAS_ARCH_FEATURE (index_Slow_BSF, bit_Slow_BSF)
+# define HAS_SLOW_BSF		HAS_ARCH_FEATURE (Slow_BSF)
 
-#define HAS_PREFER_SSE_FOR_MEMOP \
-  HAS_ARCH_FEATURE (index_Prefer_SSE_for_memop, bit_Prefer_SSE_for_memop)
+# define HAS_PREFER_SSE_FOR_MEMOP HAS_ARCH_FEATURE (Prefer_SSE_for_memop)
 
-#define HAS_FAST_UNALIGNED_LOAD \
-  HAS_ARCH_FEATURE (index_Fast_Unaligned_Load, bit_Fast_Unaligned_Load)
+# define HAS_FAST_UNALIGNED_LOAD HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+
+# define HAS_YMM_USABLE		HAS_ARCH_FEATURE (YMM_Usable)
 
 #endif	/* __ASSEMBLER__ */