/*
 * Project : SIMD_Utils
 * File : simd_utils_constants.h
 * Version : 0.2.6
 * Author : JishinMaster
 * Licence : BSD-2
 */
#if defined(SSE) || defined(AVX) || defined(AVX512)
#ifndef ARM
#include <immintrin.h>
#else
#if !defined(__aarch64__)
#define SSE2NEON_PRECISE_SQRT 1
#define SSE2NEON_PRECISE_DIV 1
#endif
// Also includes arm_neon.h
#include "sse2neon_wrapper.h"
#endif
#endif
#ifdef RISCV
#include <riscv_vector.h>
/* ELEN : element length, the maximum element width supported: 8, 16, 32 or 64 bits
VLEN : Vector Length, the vector register width, at least 128 bits
32 registers in the 0.10 standard, plus the CSRs vstart, vxsat, vxrm, vcsr, vtype, vl and vlenb
VSEW : Vector Standard Element Width (dynamic), width of the base element: 8,16,32,64,...,1024 bits
(up to 64 bits in the current intrinsics)
LMUL : Vector register grouping => may group multiple VLEN registers, so that 1 instruction can be applied to multiple registers. If LMUL is < 1, the operation applies only to a part of the register
LMUL = 1, 2, 4, 8, 1/2, 1/4, 1/8
VLMAX = LMUL*VLEN/SEW
Vector Tail Agnostic and Vector Mask Agnostic (vta and vma) allow masked operations on vectors, so that only part of a vector is modified
The Vector Fixed-Point Rounding Mode Register vxrm selects the rounding mode: round-to-nearest-up rnu, round-to-nearest-even rne, round-down rdn, round-to-odd rod
A real CPU with known CPI/latency figures would be needed to make a better choice of instructions:
fmadd vs fmacc, strided loads vs segment loads, etc.
*/
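/* Worked example: with VLEN = 128 bits, SEW = 32 and LMUL = 4,
   VLMAX = LMUL*VLEN/SEW = 4*128/32 = 16 elements per vsetvl request.
   A minimal strip-mining loop built on the macros defined below
   (illustrative sketch only, not part of the library API):

   void addf(float *dst, const float *a, const float *b, int len) {
       size_t i, vl;
       for (i = 0; i < (size_t) len; i += vl) {
           vl = VSETVL32(len - i);                    // vl <= VLMAX, remainder handled by hardware
           V_ELT_FLOAT va = VLOAD_FLOAT(a + i, vl);
           V_ELT_FLOAT vb = VLOAD_FLOAT(b + i, vl);
           VSTORE_FLOAT(dst + i, VADD_FLOAT(va, vb, vl), vl);
       }
   }
*/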
// 0 to nearest, 1 to zero (trunc), 2 round down, 3 round up, 4 round away from zero
#define _MM_ROUND_NEAREST 0
#define _MM_ROUND_TOWARD_ZERO 1
#define _MM_ROUND_DOWN 2
#define _MM_ROUND_UP 3
#define _MM_ROUND_AWAY 4
// Naming convention: a "1" in a macro name denotes either a vector-scalar
// instruction, or a scalar load/store broadcast to/from a vector
/*
# FP multiply-accumulate, overwrites addend
vfmacc.vv vd, vs1, vs2, vm
# vd[i] = +(vs1[i] * vs2[i]) + vd[i]
vfmacc.vf vd, rs1, vs2, vm
# vd[i] = +(f[rs1] * vs2[i]) + vd[i]
# FP negate-(multiply-accumulate), overwrites subtrahend
vfnmacc.vv vd, vs1, vs2, vm
# vd[i] = -(vs1[i] * vs2[i]) - vd[i]
vfnmacc.vf vd, rs1, vs2, vm
# vd[i] = -(f[rs1] * vs2[i]) - vd[i]
# FP multiply-subtract-accumulator, overwrites subtrahend
vfmsac.vv vd, vs1, vs2, vm
# vd[i] = +(vs1[i] * vs2[i]) - vd[i]
vfmsac.vf vd, rs1, vs2, vm
# vd[i] = +(f[rs1] * vs2[i]) - vd[i]
# FP negate-(multiply-subtract-accumulator), overwrites minuend
vfnmsac.vv vd, vs1, vs2, vm
# vd[i] = -(vs1[i] * vs2[i]) + vd[i]
vfnmsac.vf vd, rs1, vs2, vm
# vd[i] = -(f[rs1] * vs2[i]) + vd[i]
# FP multiply-add, overwrites multiplicand
vfmadd.vv vd, vs1, vs2, vm
# vd[i] = +(vs1[i] * vd[i]) + vs2[i]
vfmadd.vf vd, rs1, vs2, vm
# vd[i] = +(f[rs1] * vd[i]) + vs2[i]
# FP negate-(multiply-add), overwrites multiplicand
vfnmadd.vv vd, vs1, vs2, vm
# vd[i] = -(vs1[i] * vd[i]) - vs2[i]
vfnmadd.vf vd, rs1, vs2, vm
# vd[i] = -(f[rs1] * vd[i]) - vs2[i]
# FP multiply-sub, overwrites multiplicand
vfmsub.vv vd, vs1, vs2, vm
# vd[i] = +(vs1[i] * vd[i]) - vs2[i]
vfmsub.vf vd, rs1, vs2, vm
# vd[i] = +(f[rs1] * vd[i]) - vs2[i]
# FP negate-(multiply-sub), overwrites multiplicand
vfnmsub.vv vd, vs1, vs2, vm
# vd[i] = -(vs1[i] * vd[i]) + vs2[i]
vfnmsub.vf vd, rs1, vs2, vm
# vd[i] = -(f[rs1] * vd[i]) + vs2[i]
*/
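/* Mapping to the macros defined below (illustrative, using this file's own names):
   acc = VFMACC_FLOAT(acc, a, b, vl);  // acc += a*b  : accumulate form, addend overwritten
   x   = VFMADD_FLOAT(x, a, c, vl);    // x = x*a + c : multiply-add form, multiplicand overwritten
   The two differ only in which source operand the destination aliases, which
   matters when choosing which register to keep live across loop iterations. */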
#ifndef ELEN
#define ELEN 64 // vector support elements up to 64 bits
#endif
#ifndef VECTOR_LENGTH
#define MAX_ELTS8 1024 // upper bound on 8-bit elements per grouped vector, for VLEN = 1024 bits
#define MAX_ELTS32 256 // upper bound on 32-bit float/int elements
#define MAX_ELTS64 128 // upper bound on 64-bit double/int elements
#define VECTOR_LENGTH 1024
#else
#define MAX_ELTS8 VECTOR_LENGTH
#define MAX_ELTS32 (VECTOR_LENGTH / 4)
#define MAX_ELTS64 (VECTOR_LENGTH / 8)
#endif
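/* These bounds are meant for sizing scratch arrays that must hold one full
   grouped vector whatever the target VLEN, e.g. (illustrative): float tmp[MAX_ELTS32]; */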
#if 1 // the #else branch, work in progress, targets the >= 0.12 intrinsics of recent GCC and clang
#define NO_RTZ
#define vfcvt_rtz_x_f_v_i32m4 vfcvt_x_f_v_i32m4
#define vfcvt_rtz_x_f_v_i32m2 vfcvt_x_f_v_i32m2
///////////////////// FULL VECTOR m4 //////////////
#define VSETVL32 vsetvl_e32m4
#define VSETVL16 vsetvl_e16m4
//// FLOAT
#define V_ELT_FLOAT vfloat32m4_t
#define VLOAD_FLOAT vle32_v_f32m4
#define VLOAD1_FLOAT vfmv_v_f_f32m4
#define VSTORE_FLOAT vse32_v_f32m4
#define VADD_FLOAT vfadd_vv_f32m4
#define VADD1_FLOAT vfadd_vf_f32m4
#define VSUB_FLOAT vfsub_vv_f32m4
#define VSUB1_FLOAT vfsub_vf_f32m4
#define VRSUB1_FLOAT vfrsub_vf_f32m4 // v2 = f - v1
#define VMUL_FLOAT vfmul_vv_f32m4
#define VMUL1_FLOAT vfmul_vf_f32m4
#define VDIV_FLOAT vfdiv_vv_f32m4
#define VDIV1_FLOAT vfdiv_vf_f32m4
#define VRDIV1_FLOAT vfrdiv_vf_f32m4
#define VFMACC_FLOAT vfmacc_vv_f32m4 // vd[i] = +(vs1[i] * vs2[i]) + vd[i]
#define VFMACC1_FLOAT vfmacc_vf_f32m4
#define VFMADD_FLOAT vfmadd_vv_f32m4 // vd[i] = +(vs1[i] * vd[i]) + vs2[i]
#define VFMADD1_FLOAT vfmadd_vf_f32m4
#define VFMSUB_FLOAT vfmsub_vv_f32m4 // d = a*b - c
#define VREDSUM_FLOAT vfredosum_vs_f32m4_f32m1
#define VREDMAX_FLOAT vfredmax_vs_f32m4_f32m1
#define VREDMIN_FLOAT vfredmin_vs_f32m4_f32m1
#define VMIN_FLOAT vfmin_vv_f32m4
#define VMAX_FLOAT vfmax_vv_f32m4
#define VMIN1_FLOAT vfmin_vf_f32m4
#define VMAX1_FLOAT vfmax_vf_f32m4
#define VINTERP_FLOAT_INT vreinterpret_v_f32m4_i32m4
#define VINTERP_INT_FLOAT vreinterpret_v_i32m4_f32m4
#define VCVT_RTZ_FLOAT_INT vfcvt_rtz_x_f_v_i32m4
#define VCVT_FLOAT_INT vfcvt_x_f_v_i32m4
#define VCVT_INT_FLOAT vfcvt_f_x_v_f32m4
#define VMERGE_FLOAT vmerge_vvm_f32m4
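// Pre-1.0 masked intrinsics take an explicit maskedoff vector for the inactive
// lanes; passing op1 twice in the helpers below keeps inactive lanes equal to the input.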
static inline vfloat32m4_t VMUL1_FLOAT_MASK(vbool8_t mask, vfloat32m4_t op1, float op2, size_t vl){
return vfmul_vf_f32m4_m(mask, op1, op1, op2, vl);
}
#define VSQRT_FLOAT vfsqrt_v_f32m4
#define VLE_FLOAT_STRIDE vlse32_v_f32m4
#define VEQ1_FLOAT_BOOL vmfeq_vf_f32m4_b8
#define VEQ_FLOAT_BOOL vmfeq_vv_f32m4_b8
#define VGT1_FLOAT_BOOL vmfgt_vf_f32m4_b8
#define VNE1_FLOAT_BOOL vmfne_vf_f32m4_b8
#define VLT1_FLOAT_BOOL vmflt_vf_f32m4_b8
#define VLE1_FLOAT_BOOL vmfle_vf_f32m4_b8
#define VABS_FLOAT vfabs_v_f32m4
#define VMERGE1_FLOAT vfmerge_vfm_f32m4
#define VGATHER_FLOAT vrgather_vv_f32m4
//// INT
#define V_ELT_INT vint32m4_t
#define VLOAD_INT vle32_v_i32m4
#define VLOAD1_INT vmv_v_x_i32m4
#define VSTORE_INT vse32_v_i32m4
#define VADD_INT vadd_vv_i32m4
#define VADD1_INT vadd_vx_i32m4
#define VMUL_INT vmul_vv_i32m4
#define VMUL1_INT vmul_vx_i32m4
#define VSUB_INT vsub_vv_i32m4
#define VSUB1_INT vsub_vx_i32m4
#define VAND1_INT vand_vx_i32m4
#define VAND_INT vand_vv_i32m4
#define VXOR_INT vxor_vv_i32m4
#define VSLL1_INT vsll_vx_i32m4
#define VEQ1_INT_BOOL vmseq_vx_i32m4_b8
#define VEQ_INT_BOOL vmseq_vv_i32m4_b8
#define VGT1_INT_BOOL vmsgt_vx_i32m4_b8
#define VNE1_INT_BOOL vmsne_vx_i32m4_b8
#define VLT1_INT_BOOL vmslt_vx_i32m4_b8
#define VLE1_INT_BOOL vmsle_vx_i32m4_b8
static inline vint32m4_t VADD1_INT_MASK(vbool8_t mask, vint32m4_t op1, int32_t op2, size_t vl){
return vadd_vx_i32m4_m(mask, op1, op1, op2, vl);
}
static inline vint32m4_t VSUB1_INT_MASK(vbool8_t mask, vint32m4_t op1, int32_t op2, size_t vl){
return vsub_vx_i32m4_m(mask, op1, op1, op2, vl);
}
#define VOR1_INT vor_vx_i32m4
#define VSRA1_INT vsra_vx_i32m4
#define VMIN_INT vmin_vv_i32m4
#define VMIN1_INT vmin_vx_i32m4
#define VMAX_INT vmax_vv_i32m4
#define VMAX1_INT vmax_vx_i32m4
#define VMERGE1_INT vmerge_vxm_i32m4
#define VMERGE_INT vmerge_vvm_i32m4
#define VNEG_INT vneg_v_i32m4
#define VREDSUM_INT vredsum_vs_i32m4_i32m1
#define VREDMAX_INT vredmax_vs_i32m4_i32m1
#define VREDMIN_INT vredmin_vs_i32m4_i32m1
#define VGATHER_INT vrgather_vv_i32m4
#define VNOT_INT vnot_v_i32m4
//// UINT
#define VLOAD_UINT vle32_v_u32m4
#define VSTORE_UINT vse32_v_u32m4
#define V_ELT_UINT vuint32m4_t
#define VCVT_FLOAT_UINT vfcvt_xu_f_v_u32m4
//// SHORT
#define V_ELT_SHORT vint16m4_t
#define VLOAD_SHORT vle16_v_i16m4
#define VLOAD1_SHORT vmv_v_x_i16m4
#define VSTORE_SHORT vse16_v_i16m4
#define VADD_SHORT vadd_vv_i16m4
#define VSUB_SHORT vsub_vv_i16m4
#define VREDSUMW_SHORT vwredsum_vs_i16m4_i32m1
#define VGT_SHORT_BOOL vmsgt_vv_i16m4_b4
#define VMERGE_SHORT vmerge_vvm_i16m4
//// BOOL for 16 bits elements
#define V_ELT_BOOL16 vbool4_t
//// BOOL for 32 bits elements
#define V_ELT_BOOL32 vbool8_t
#define VNOT_BOOL vmnot_m_b8
#define VCLEAR_BOOL vmclr_m_b8
#define VXOR_BOOL vmxor_mm_b8
#define VOR_BOOL vmor_mm_b8
#define VAND_BOOL vmand_mm_b8
#define VANDNOT_BOOL vmandn_mm_b8
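/* Typical mask/merge idiom built from the macros above (illustrative sketch):
   zero out the negative lanes of v, i.e. v = max(v, 0):
       V_ELT_BOOL32 neg = VLT1_FLOAT_BOOL(v, 0.0f, vl); // mask set where lane < 0
       v = VMERGE1_FLOAT(neg, v, 0.0f, vl);             // take 0.0f where mask is set
*/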
/////////////////////////// HALF VECTOR, m2 ///////////////
#define VSETVL32H vsetvl_e32m2
#define VSETVL16H vsetvl_e16m2
//// FLOATH
#define V_ELT_FLOATH vfloat32m2_t
#define VLOAD_FLOATH vle32_v_f32m2
#define VLOAD1_FLOATH vfmv_v_f_f32m2
#define VLOAD_FLOATH2 vlseg2e32_v_f32m2
#define VLOAD_FLOATH_STRIDE vlse32_v_f32m2
#define VSTORE_FLOATH vse32_v_f32m2
#define VSTORE_FLOATHH vse32_v_f32m1
#define VLOAD_FLOATHH vle32_v_f32m1
#define VSTORE_FLOATH2 vsseg2e32_v_f32m2
#define VINTERP_FLOATH_INTH vreinterpret_v_f32m2_i32m2
#define VINTERP_INTH_FLOATH vreinterpret_v_i32m2_f32m2
#define VXOR1_INTH vxor_vx_i32m2
#define VADD_FLOATH vfadd_vv_f32m2
#define VADD1_FLOATH vfadd_vf_f32m2
static inline vfloat32m2_t VADD1_FLOATH_MASK(vbool16_t mask, vfloat32m2_t op1, float op2, size_t vl){
return vfadd_vf_f32m2_m(mask, op1, op1, op2, vl);
}
#define VSUB_FLOATH vfsub_vv_f32m2
#define VSUB1_FLOATH vfsub_vf_f32m2 // v2 = v1 - f
#define VRSUB1_FLOATH vfrsub_vf_f32m2 // v2 = f - v1
#define VMUL_FLOATH vfmul_vv_f32m2
#define VMUL1_FLOATH vfmul_vf_f32m2
static inline vfloat32m2_t VMUL1_FLOATH_MASK(vbool16_t mask, vfloat32m2_t op1, float op2, size_t vl){
return vfmul_vf_f32m2_m(mask, op1, op1, op2, vl);
}
#define VDIV_FLOATH vfdiv_vv_f32m2
#define VDIV1_FLOATH vfdiv_vf_f32m2
#define VRDIV1_FLOATH vfrdiv_vf_f32m2
#define VFMACC_FLOATH vfmacc_vv_f32m2 // d = a + b*c
#define VFMACC1_FLOATH vfmacc_vf_f32m2
#define VFMADD_FLOATH vfmadd_vv_f32m2 // vd[i] = +(vs1[i] * vd[i]) + vs2[i]
#define VFMADD1_FLOATH vfmadd_vf_f32m2
#define VFMSUB_FLOATH vfmsub_vv_f32m2 // d = a*b - c
#define VREDSUM_FLOATH vfredosum_vs_f32m2_f32m1
#define VREDMAX_FLOATH vfredmax_vs_f32m2_f32m1
#define VREDMIN_FLOATH vfredmin_vs_f32m2_f32m1
#define VMIN_FLOATH vfmin_vv_f32m2
#define VMIN1_FLOATH vfmin_vf_f32m2
#define VMAX_FLOATH vfmax_vv_f32m2
#define VMAX1_FLOATH vfmax_vf_f32m2
#define VCVT_RTZ_FLOATH_INTH vfcvt_rtz_x_f_v_i32m2
#define VCVT_FLOATH_INTH vfcvt_x_f_v_i32m2
#define VCVT_INTH_FLOATH vfcvt_f_x_v_f32m2
#define VMERGE_FLOATH vmerge_vvm_f32m2
#define VSQRT_FLOATH vfsqrt_v_f32m2
#define VEQ1_FLOATH_BOOLH vmfeq_vf_f32m2_b16
#define VEQ_FLOATH_BOOLH vmfeq_vv_f32m2_b16
#define VGE1_FLOATH_BOOLH vmfge_vf_f32m2_b16
#define VGT1_FLOATH_BOOLH vmfgt_vf_f32m2_b16
#define VNE1_FLOATH_BOOLH vmfne_vf_f32m2_b16
#define VLT1_FLOATH_BOOLH vmflt_vf_f32m2_b16
#define VLE1_FLOATH_BOOLH vmfle_vf_f32m2_b16
#define VABS_FLOATH vfabs_v_f32m2
#define VMERGE1_FLOATH vfmerge_vfm_f32m2
#define VGATHER_FLOATH vrgather_vv_f32m2
//// INTH
#define V_ELT_INTH vint32m2_t
#define VSTORE_INTHH vse32_v_i32m1
#define VLOAD_INTHH vle32_v_i32m1
#define VLOAD_INTH vle32_v_i32m2
#define VLOAD1_INTH vmv_v_x_i32m2
#define VLOAD1_INTHH vmv_v_x_i32m1
#define VSTORE_INTH vse32_v_i32m2
#define VADD_INTH vadd_vv_i32m2
#define VADD1_INTH vadd_vx_i32m2
static inline vint32m2_t VADD1_INTH_MASK(vbool16_t mask, vint32m2_t op1, int32_t op2, size_t vl){
return vadd_vx_i32m2_m(mask, op1, op1, op2, vl);
}
#define VADD1_INTH_MASKEDOFF vadd_vx_i32m2_m
#define VADD1_INT64H_MASKEDOFF vadd_vx_i64m2_m
#define VMUL_INTH vmul_vv_i32m2
#define VMUL1_INTH vmul_vx_i32m2
#define VSUB_INTH vsub_vv_i32m2
#define VSUB1_INTH vsub_vx_i32m2
static inline vint32m2_t VSUB1_INTH_MASK(vbool16_t mask, vint32m2_t op1, int32_t op2, size_t vl){
return vsub_vx_i32m2_m(mask, op1, op1, op2, vl);
}
#define VAND1_INTH vand_vx_i32m2
#define VAND_INTH vand_vv_i32m2
#define VXOR_INTH vxor_vv_i32m2
#define VSLL1_INTH vsll_vx_i32m2
#define VEQ1_INTH_BOOLH vmseq_vx_i32m2_b16
#define VGT1_INTH_BOOLH vmsgt_vx_i32m2_b16
#define VNE1_INTH_BOOLH vmsne_vx_i32m2_b16
#define VLT1_INTH_BOOLH vmslt_vx_i32m2_b16
#define VLE1_INTH_BOOLH vmsle_vx_i32m2_b16
#define VEQ_INTH_BOOLH vmseq_vv_i32m2_b16
#define VOR1_INTH vor_vx_i32m2
#define VSRA1_INTH vsra_vx_i32m2
#define VMIN_INTH vmin_vv_i32m2
#define VMIN1_INTH vmin_vx_i32m2
#define VMAX_INTH vmax_vv_i32m2
#define VMAX1_INTH vmax_vx_i32m2
#define VNOT_INTH vnot_v_i32m2
#define VMERGE_INTH vmerge_vvm_i32m2
//// UINTH
#define VLOAD_UINTH vle32_v_u32m2
#define V_ELT_UINTH vuint32m2_t
#define VCVT_FLOATH_UINTH vfcvt_xu_f_v_u32m2
//// SHORTH
#define V_ELT_SHORTH vint16m2_t
#define VLOAD_SHORTH vle16_v_i16m2
#define VLOAD1_SHORTH vmv_v_x_i16m2
#define VSTORE_SHORTH vse16_v_i16m2
#define VADD_SHORTH vadd_vv_i16m2
#define VREDSUMW_SHORTH vwredsum_vs_i16m2_i32m1
#define VCVT_INT_SHORTH vnclip_wx_i16m2
#if __riscv_v != 7000
#define VCVT_SHORTH_INT vsext_vf2_i32m4
#else
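// RVV 0.7 intrinsics lack vsext; a widening multiply by 1 performs the same
// int16 -> int32 sign extension.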
#define VCVT_SHORTH_INT(a, b) vwmul_vx_i32m4((a), 1, (b))
#endif
//// USHORTH
#define V_ELT_USHORTH vuint16m2_t
#define VLOAD_USHORTH vle16_v_u16m2
#define VSTORE_USHORTH vse16_v_u16m2
#define VCVT_UINT_USHORTH vnclipu_wx_u16m2
//// UBYTEHH
#define V_ELT_UBYTEHH vuint8m1_t
#define VLOAD_UBYTEHH vle8_v_u8m1
#define VSTORE_UBYTEHH vse8_v_u8m1
#define VCVT_USHORTH_UBYTEHH vnclipu_wx_u8m1
//// BOOL for Half length vector 32 bits elements
#define V_ELT_BOOL32H vbool16_t
#define VNOT_BOOLH vmnot_m_b16
#define VCLEAR_BOOLH vmclr_m_b16
#define VXOR_BOOLH vmxor_mm_b16
#define VOR_BOOLH vmor_mm_b16
#define VAND_BOOLH vmand_mm_b16
#define VANDNOT_BOOLH vmandn_mm_b16
//#define VANDNOT_BOOLH vmnand_mm_b16
#if ELEN >= 64
#define VSETVL64 vsetvl_e64m4
//// DOUBLE
#define V_ELT_DOUBLE vfloat64m4_t
#define VLOAD_DOUBLE vle64_v_f64m4
#define VLOAD1_DOUBLE vfmv_v_f_f64m4
#define VSTORE_DOUBLE vse64_v_f64m4
#define VADD_DOUBLE vfadd_vv_f64m4
#define VADD1_DOUBLE vfadd_vf_f64m4
#define VSUB_DOUBLE vfsub_vv_f64m4
#define VSUB1_DOUBLE vfsub_vf_f64m4
#define VMUL_DOUBLE vfmul_vv_f64m4
#define VMUL1_DOUBLE vfmul_vf_f64m4
#define VDIV_DOUBLE vfdiv_vv_f64m4
#define VFMA_DOUBLE vfmacc_vv_f64m4 // d = a + b*c
#define VFMA1_DOUBLE vfmacc_vf_f64m4
#define VFMSUB_DOUBLE vfmsub_vv_f64m4 // d = a*b - c
#define VREDSUM_DOUBLE vfredosum_vs_f64m4_f64m1
#define VREDMAX_DOUBLE vfredmax_vs_f64m4_f64m1
#define VREDMIN_DOUBLE vfredmin_vs_f64m4_f64m1
#define VMIN_DOUBLE vfmin_vv_f64m4
#define VMAX_DOUBLE vfmax_vv_f64m4
#define VMIN1_DOUBLE vfmin_vf_f64m4
#define VMAX1_DOUBLE vfmax_vf_f64m4
#define VINTERP_DOUBLE_INT vreinterpret_v_f64m4_i64m4
#define VINTERP_INT_DOUBLE vreinterpret_v_i64m4_f64m4
#define VCVT_RTZ_DOUBLE_INT vfcvt_rtz_x_f_v_i64m4
#define VCVT_DOUBLE_INT vfcvt_x_f_v_i64m4
#define VCVT_INT_DOUBLE vfcvt_f_x_v_f64m4
#define VABS_DOUBLE vfabs_v_f64m4
#define VSQRT_DOUBLE vfsqrt_v_f64m4
#define VCVT_DOUBLE_FLOAT vfncvt_f_f_w_f32m2
#define VCVT_FLOAT_DOUBLE vfwcvt_f_f_v_f64m4
//// DOUBLEH
#define VSETVL64H vsetvl_e64m2
#define V_ELT_DOUBLEH vfloat64m2_t
#define VLOAD_DOUBLEH2 vlseg2e64_v_f64m2
#define VLOAD_DOUBLEH_STRIDE vlse64_v_f64m2
#define VSTORE_DOUBLEH2 vsseg2e64_v_f64m2
#define VLOAD_DOUBLEH vle64_v_f64m2
#define VLOAD1_DOUBLEH vfmv_v_f_f64m2
#define VSTORE_DOUBLEH vse64_v_f64m2
#define VADD_DOUBLEH vfadd_vv_f64m2
#define VADD1_DOUBLEH vfadd_vf_f64m2
#define VSUB_DOUBLEH vfsub_vv_f64m2
#define VSUB1_DOUBLEH vfsub_vf_f64m2
#define VRSUB1_DOUBLEH vfrsub_vf_f64m2 // v2 = f - v1
#define VMUL_DOUBLEH vfmul_vv_f64m2
#define VMUL1_DOUBLEH vfmul_vf_f64m2
#define VDIV_DOUBLEH vfdiv_vv_f64m2
#define VFMACC_DOUBLEH vfmacc_vv_f64m2 // d = a + b*c
#define VFMACC1_DOUBLEH vfmacc_vf_f64m2
#define VFMADD_DOUBLEH vfmadd_vv_f64m2
#define VFMADD1_DOUBLEH vfmadd_vf_f64m2
#define VFMA1_DOUBLEH vfmacc_vf_f64m2
#define VFMSUB_DOUBLEH vfmsub_vv_f64m2 // d = a*b - c
#define VREDSUM_DOUBLEH vfredosum_vs_f64m2_f64m1
#define VREDMAX_DOUBLEH vfredmax_vs_f64m2_f64m1
#define VREDMIN_DOUBLEH vfredmin_vs_f64m2_f64m1
#define VMIN_DOUBLEH vfmin_vv_f64m2
#define VMAX_DOUBLEH vfmax_vv_f64m2
#define VMIN1_DOUBLEH vfmin_vf_f64m2
#define VMAX1_DOUBLEH vfmax_vf_f64m2
#define VINTERP_DOUBLEH_INTH vreinterpret_v_f64m2_i64m2
#define VINTERP_INTH_DOUBLEH vreinterpret_v_i64m2_f64m2
#define VCVT_RTZ_DOUBLEH_INTH vfcvt_rtz_x_f_v_i64m2
#define VCVT_DOUBLEH_INTH vfcvt_x_f_v_i64m2
#define VCVT_INTH_DOUBLEH vfcvt_f_x_v_f64m2
#define VABS_DOUBLEH vfabs_v_f64m2
#define VSQRT_DOUBLEH vfsqrt_v_f64m2
#define VCVT_DOUBLEH_FLOATH vfncvt_f_f_w_f32m2
#define VCVT_FLOATH_DOUBLEH vfwcvt_f_f_v_f64m2
#define VMERGE_DOUBLEH vmerge_vvm_f64m2
#define VMERGE1_DOUBLEH vfmerge_vfm_f64m2
static inline vfloat64m2_t VMUL1_DOUBLEH_MASK(vbool32_t mask, vfloat64m2_t op1, double op2, size_t vl){
return vfmul_vf_f64m2_m(mask, op1, op1, op2, vl);
}
static inline vfloat64m2_t VADD1_DOUBLEH_MASK(vbool32_t mask, vfloat64m2_t op1, double op2, size_t vl){
return vfadd_vf_f64m2_m(mask, op1, op1, op2, vl);
}
#define VADD1_DOUBLEH_MASKEDOFF vfadd_vf_f64m2_m
#define VEQ1_DOUBLEH_BOOLH vmfeq_vf_f64m2_b32
#define VEQ_DOUBLEH_BOOLH vmfeq_vv_f64m2_b32
#define VGE1_DOUBLEH_BOOLH vmfge_vf_f64m2_b32
#define VGT1_DOUBLEH_BOOLH vmfgt_vf_f64m2_b32
#define VNE1_DOUBLEH_BOOLH vmfne_vf_f64m2_b32
#define VLT1_DOUBLEH_BOOLH vmflt_vf_f64m2_b32
#define VLE1_DOUBLEH_BOOLH vmfle_vf_f64m2_b32
// INT64H
#define V_ELT_INT64H vint64m2_t
#define VLOAD1_INT64H vmv_v_x_i64m2
#define VSLL1_INT64H vsll_vx_i64m2
#define VADD1_INT64H vadd_vx_i64m2
static inline vint64m2_t VADD1_INT64H_MASK(vbool32_t mask, vint64m2_t op1, int64_t op2, size_t vl){
return vadd_vx_i64m2_m(mask, op1, op1, op2, vl);
}
static inline vint64m2_t VSUB1_INT64H_MASK(vbool32_t mask, vint64m2_t op1, int64_t op2, size_t vl){
return vsub_vx_i64m2_m(mask, op1, op1, op2, vl);
}
#define VAND1_INT64H vand_vx_i64m2
#define VXOR_INT64H vxor_vv_i64m2
#define VNE1_INTH_BOOL64H vmsne_vx_i64m2_b32
#define VEQ1_INTH_BOOL64H vmseq_vx_i64m2_b32
#define VGT1_INTH_BOOL64H vmsgt_vx_i64m2_b32
#define VMERGE_INT64H vmerge_vvm_i64m2
#define VMERGE1_INT64H vmerge_vxm_i64m2
#define VSTORE_INT64H vse64_v_i64m2
#define vfcvt_rtz_x_f_v_i64m4 vfcvt_x_f_v_i64m4
#define vfcvt_rtz_x_f_v_i64m2 vfcvt_x_f_v_i64m2
//BOOL64H
#define V_ELT_BOOL64H vbool32_t
#define VNOT_BOOL64H vmnot_m_b32
#define VCLEAR_BOOL64H vmclr_m_b32
#define VXOR_BOOL64H vmxor_mm_b32
#define VOR_BOOL64H vmor_mm_b32
#define VAND_BOOL64H vmand_mm_b32
#define VANDNOT_BOOL64H vmandn_mm_b32
#endif // ELEN >= 64
#else // RVV >= 0.12
///////////////////// FULL VECTOR m4 //////////////
#define VSETVL32 __riscv_vsetvl_e32m4
#define VSETVL16 __riscv_vsetvl_e16m4
//// FLOAT
#define V_ELT_FLOAT vfloat32m4_t
#define VLOAD_FLOAT __riscv_vle32_v_f32m4
#define VLOAD1_FLOAT __riscv_vfmv_v_f_f32m4
#define VSTORE_FLOAT __riscv_vse32_v_f32m4
#define VADD_FLOAT __riscv_vfadd_vv_f32m4
#define VADD1_FLOAT __riscv_vfadd_vf_f32m4
#define VSUB_FLOAT __riscv_vfsub_vv_f32m4
#define VSUB1_FLOAT __riscv_vfsub_vf_f32m4
#define VRSUB1_FLOAT __riscv_vfrsub_vf_f32m4 // v2 = f - v1
#define VMUL_FLOAT __riscv_vfmul_vv_f32m4
#define VMUL1_FLOAT __riscv_vfmul_vf_f32m4
#define VDIV_FLOAT __riscv_vfdiv_vv_f32m4
#define VDIV1_FLOAT __riscv_vfdiv_vf_f32m4
#define VRDIV1_FLOAT __riscv_vfrdiv_vf_f32m4
#define VFMACC_FLOAT __riscv_vfmacc_vv_f32m4 // vd[i] = +(vs1[i] * vs2[i]) + vd[i]
#define VFMACC1_FLOAT __riscv_vfmacc_vf_f32m4
#define VFMADD_FLOAT __riscv_vfmadd_vv_f32m4 // vd[i] = +(vs1[i] * vd[i]) + vs2[i]
#define VFMADD1_FLOAT __riscv_vfmadd_vf_f32m4
#define VFMSUB_FLOAT __riscv_vfmsub_vv_f32m4 // d = a*b - c
#define VREDSUM_FLOAT __riscv_vfredosum_vs_f32m4_f32m1_tu
/*static inline vfloat32m1_t VREDSUM_FLOAT(vfloat32m1_t dest,
vfloat32m4_t vector, vfloat32m1_t scalar, size_t vl){
return __riscv_vfredosum_vs_f32m4_f32m1(vector, scalar, vl);
}*/
#define VREDMAX_FLOAT __riscv_vfredmax_vs_f32m4_f32m1_tu
#define VREDMIN_FLOAT __riscv_vfredmin_vs_f32m4_f32m1_tu
#define VMIN_FLOAT __riscv_vfmin_vv_f32m4
#define VMAX_FLOAT __riscv_vfmax_vv_f32m4
#define VMIN1_FLOAT __riscv_vfmin_vf_f32m4
#define VMAX1_FLOAT __riscv_vfmax_vf_f32m4
#define VINTERP_FLOAT_INT __riscv_vreinterpret_v_f32m4_i32m4
#define VINTERP_INT_FLOAT __riscv_vreinterpret_v_i32m4_f32m4
#define VCVT_RTZ_FLOAT_INT __riscv_vfcvt_rtz_x_f_v_i32m4
#define VCVT_FLOAT_INT __riscv_vfcvt_x_f_v_i32m4
#define VCVT_INT_FLOAT __riscv_vfcvt_f_x_v_f32m4
static inline vfloat32m4_t VMERGE_FLOAT(vbool8_t mask, vfloat32m4_t op1,
vfloat32m4_t op2, size_t vl){
return __riscv_vmerge_vvm_f32m4(op1, op2, mask, vl);
}
#define VMUL1_FLOAT_MASK __riscv_vfmul_vf_f32m4_m
#define VSQRT_FLOAT __riscv_vfsqrt_v_f32m4
#define VLE_FLOAT_STRIDE __riscv_vlse32_v_f32m4
#define VEQ1_FLOAT_BOOL __riscv_vmfeq_vf_f32m4_b8
#define VEQ_FLOAT_BOOL __riscv_vmfeq_vv_f32m4_b8
#define VGT1_FLOAT_BOOL __riscv_vmfgt_vf_f32m4_b8
#define VNE1_FLOAT_BOOL __riscv_vmfne_vf_f32m4_b8
#define VLT1_FLOAT_BOOL __riscv_vmflt_vf_f32m4_b8
#define VLE1_FLOAT_BOOL __riscv_vmfle_vf_f32m4_b8
#define VABS_FLOAT __riscv_vfabs_v_f32m4
static inline vfloat32m4_t VMERGE1_FLOAT(vbool8_t mask, vfloat32m4_t op1,
float op2, size_t vl){
return __riscv_vfmerge_vfm_f32m4(op1, op2, mask, vl);
}
#define VGATHER_FLOAT __riscv_vrgather_vv_f32m4
#define vfmv_v_f_f32m1 __riscv_vfmv_v_f_f32m1
//// INT
#define V_ELT_INT vint32m4_t
#define VLOAD_INT __riscv_vle32_v_i32m4
#define VLOAD1_INT __riscv_vmv_v_x_i32m4
#define VSTORE_INT __riscv_vse32_v_i32m4
#define VADD_INT __riscv_vadd_vv_i32m4
#define VADD1_INT __riscv_vadd_vx_i32m4
#define VMUL_INT __riscv_vmul_vv_i32m4
#define VMUL1_INT __riscv_vmul_vx_i32m4
#define VSUB_INT __riscv_vsub_vv_i32m4
#define VSUB1_INT __riscv_vsub_vx_i32m4
#define VAND1_INT __riscv_vand_vx_i32m4
#define VAND_INT __riscv_vand_vv_i32m4
#define VXOR_INT __riscv_vxor_vv_i32m4
#define VSLL1_INT __riscv_vsll_vx_i32m4
#define VEQ1_INT_BOOL __riscv_vmseq_vx_i32m4_b8
#define VEQ_INT_BOOL __riscv_vmseq_vv_i32m4_b8
#define VGT1_INT_BOOL __riscv_vmsgt_vx_i32m4_b8
#define VNE1_INT_BOOL __riscv_vmsne_vx_i32m4_b8
#define VLT1_INT_BOOL __riscv_vmslt_vx_i32m4_b8
#define VLE1_INT_BOOL __riscv_vmsle_vx_i32m4_b8
#define VADD1_INT_MASK __riscv_vadd_vx_i32m4_m
#define VSUB1_INT_MASK __riscv_vsub_vx_i32m4_m
#define VSUB1_INT __riscv_vsub_vx_i32m4
#define VOR1_INT __riscv_vor_vx_i32m4
#define VSRA1_INT __riscv_vsra_vx_i32m4
#define VMIN_INT __riscv_vmin_vv_i32m4
#define VMIN1_INT __riscv_vmin_vx_i32m4
#define VMAX_INT __riscv_vmax_vv_i32m4
#define VMAX1_INT __riscv_vmax_vx_i32m4
static inline vint32m4_t VMERGE_INT(vbool8_t mask, vint32m4_t op1,
vint32m4_t op2, size_t vl){
return __riscv_vmerge_vvm_i32m4(op1, op2, mask, vl);
}
static inline vint32m4_t VMERGE1_INT(vbool8_t mask, vint32m4_t op1,
int32_t op2, size_t vl){
return __riscv_vmerge_vxm_i32m4(op1, op2, mask, vl);
}
#define VNEG_INT __riscv_vneg_v_i32m4
#define VREDSUM_INT __riscv_vredsum_vs_i32m4_i32m1_tu
/*static inline vint32m1_t VREDSUM_INT(vint32m1_t dest,
vint32m4_t vector, vint32m1_t scalar, size_t vl){
return __riscv_vredsum_vs_i32m4_i32m1(vector, scalar, vl);
}*/
#define VREDMAX_INT __riscv_vredmax_vs_i32m4_i32m1_tu
#define VREDMIN_INT __riscv_vredmin_vs_i32m4_i32m1_tu
#define VGATHER_INT __riscv_vrgather_vv_i32m4
#define VNOT_INT __riscv_vnot_v_i32m4
//// UINT
#define VLOAD_UINT __riscv_vle32_v_u32m4
#define VSTORE_UINT __riscv_vse32_v_u32m4
#define V_ELT_UINT vuint32m4_t
#define VCVT_FLOAT_UINT __riscv_vfcvt_xu_f_v_u32m4
//// SHORT
#define V_ELT_SHORT vint16m4_t
#define VLOAD_SHORT __riscv_vle16_v_i16m4
#define VLOAD1_SHORT __riscv_vmv_v_x_i16m4
#define VSTORE_SHORT __riscv_vse16_v_i16m4
#define VADD_SHORT __riscv_vadd_vv_i16m4
#define VSUB_SHORT __riscv_vsub_vv_i16m4
#define VREDSUMW_SHORT __riscv_vwredsum_vs_i16m4_i32m1_tu
#define VGT_SHORT_BOOL __riscv_vmsgt_vv_i16m4_b4
static inline vint16m4_t VMERGE_SHORT(vbool4_t mask, vint16m4_t op1,
vint16m4_t op2, size_t vl){
return __riscv_vmerge_vvm_i16m4(op1, op2, mask, vl);
}
//// BOOL for 16 bits elements
#define V_ELT_BOOL16 vbool4_t
//// BOOL for 32 bits elements
#define V_ELT_BOOL32 vbool8_t
#define VNOT_BOOL __riscv_vmnot_m_b8
#define VCLEAR_BOOL __riscv_vmclr_m_b8
#define VXOR_BOOL __riscv_vmxor_mm_b8
#define VOR_BOOL __riscv_vmor_mm_b8
#define VAND_BOOL __riscv_vmand_mm_b8
#define VANDNOT_BOOL __riscv_vmandn_mm_b8
/////////////////////////// HALF VECTOR, m2 ///////////////
#define VSETVL32H __riscv_vsetvl_e32m2
#define VSETVL16H __riscv_vsetvl_e16m2
//// FLOATH
#define V_ELT_FLOATH vfloat32m2_t
#define VLOAD_FLOATH __riscv_vle32_v_f32m2
#define VLOAD1_FLOATH __riscv_vfmv_v_f_f32m2
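// vlseg2/vsseg2 wrappers: deinterleave/re-interleave element pairs
// (v0 receives the even-indexed elements, v1 the odd ones), typically
// used for interleaved complex data.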
static inline void VLOAD_FLOATH2(vfloat32m2_t *v0, vfloat32m2_t *v1, float* base, size_t vl){
vfloat32m2x2_t v = __riscv_vlseg2e32_v_f32m2x2(base, vl);
*v0 = __riscv_vget_v_f32m2x2_f32m2(v, 0);
*v1 = __riscv_vget_v_f32m2x2_f32m2(v, 1);
}
static inline void VSTORE_FLOATH2(float* base, vfloat32m2_t v0, vfloat32m2_t v1, size_t vl){
vfloat32m2x2_t v = __riscv_vcreate_v_f32m2x2(v0, v1);
__riscv_vsseg2e32_v_f32m2x2(base, v, vl);
}
#define VSTORE_FLOATHH __riscv_vse32_v_f32m1
#define VLOAD_FLOATHH __riscv_vle32_v_f32m1
#define VLOAD_FLOATH_STRIDE __riscv_vlse32_v_f32m2
#define VSTORE_FLOATH __riscv_vse32_v_f32m2
#define VINTERP_FLOATH_INTH __riscv_vreinterpret_v_f32m2_i32m2
#define VINTERP_INTH_FLOATH __riscv_vreinterpret_v_i32m2_f32m2
#define VXOR1_INTH __riscv_vxor_vx_i32m2
#define VADD_FLOATH __riscv_vfadd_vv_f32m2
#define VADD1_FLOATH __riscv_vfadd_vf_f32m2
#define VADD1_FLOATH_MASK __riscv_vfadd_vf_f32m2_m
#define VSUB_FLOATH __riscv_vfsub_vv_f32m2
#define VSUB1_FLOATH __riscv_vfsub_vf_f32m2 // v2 = v1 - f
#define VRSUB1_FLOATH __riscv_vfrsub_vf_f32m2 // v2 = f - v1
#define VMUL_FLOATH __riscv_vfmul_vv_f32m2
#define VMUL1_FLOATH __riscv_vfmul_vf_f32m2
#define VMUL1_FLOATH_MASK __riscv_vfmul_vf_f32m2_m
#define VDIV_FLOATH __riscv_vfdiv_vv_f32m2
#define VDIV1_FLOATH __riscv_vfdiv_vf_f32m2
#define VRDIV1_FLOATH __riscv_vfrdiv_vf_f32m2
#define VFMACC_FLOATH __riscv_vfmacc_vv_f32m2 // d = a + b*c
#define VFMACC1_FLOATH __riscv_vfmacc_vf_f32m2
#define VFMADD_FLOATH __riscv_vfmadd_vv_f32m2 // vd[i] = +(vs1[i] * vd[i]) + vs2[i]
#define VFMADD1_FLOATH __riscv_vfmadd_vf_f32m2
#define VFMSUB_FLOATH __riscv_vfmsub_vv_f32m2 // d = a*b - c
#define VREDSUM_FLOATH __riscv_vfredosum_vs_f32m2_f32m1_tu
/*static inline vfloat32m1_t VREDSUM_FLOATH(vfloat32m1_t dest,
vfloat32m2_t vector, vfloat32m1_t scalar, size_t vl){
return __riscv_vfredosum_vs_f32m2_f32m1(vector, scalar, vl);
}*/
#define VREDMAX_FLOATH __riscv_vfredmax_vs_f32m2_f32m1_tu
#define VREDMIN_FLOATH __riscv_vfredmin_vs_f32m2_f32m1_tu
#define VMIN_FLOATH __riscv_vfmin_vv_f32m2
#define VMIN1_FLOATH __riscv_vfmin_vf_f32m2
#define VMAX_FLOATH __riscv_vfmax_vv_f32m2
#define VMAX1_FLOATH __riscv_vfmax_vf_f32m2
#define VCVT_RTZ_FLOATH_INTH __riscv_vfcvt_rtz_x_f_v_i32m2
#define VCVT_FLOATH_INTH __riscv_vfcvt_x_f_v_i32m2
#define VCVT_INTH_FLOATH __riscv_vfcvt_f_x_v_f32m2
static inline vfloat32m2_t VMERGE_FLOATH(vbool16_t mask, vfloat32m2_t op1,
vfloat32m2_t op2, size_t vl){
return __riscv_vmerge_vvm_f32m2(op1, op2, mask, vl);
}
#define VSQRT_FLOATH __riscv_vfsqrt_v_f32m2
#define VEQ1_FLOATH_BOOLH __riscv_vmfeq_vf_f32m2_b16
#define VEQ_FLOATH_BOOLH __riscv_vmfeq_vv_f32m2_b16
#define VGE1_FLOATH_BOOLH __riscv_vmfge_vf_f32m2_b16
#define VGT1_FLOATH_BOOLH __riscv_vmfgt_vf_f32m2_b16
#define VNE1_FLOATH_BOOLH __riscv_vmfne_vf_f32m2_b16
#define VLT1_FLOATH_BOOLH __riscv_vmflt_vf_f32m2_b16
#define VLE1_FLOATH_BOOLH __riscv_vmfle_vf_f32m2_b16
#define VABS_FLOATH __riscv_vfabs_v_f32m2
static inline vfloat32m2_t VMERGE1_FLOATH(vbool16_t mask, vfloat32m2_t op1,
float op2, size_t vl){
return __riscv_vfmerge_vfm_f32m2(op1, op2, mask, vl);
}
#define VGATHER_FLOATH __riscv_vrgather_vv_f32m2
//// INTH
#define V_ELT_INTH vint32m2_t
#define VSTORE_INTHH __riscv_vse32_v_i32m1
#define VLOAD_INTHH __riscv_vle32_v_i32m1
#define VLOAD_INTH __riscv_vle32_v_i32m2
#define VLOAD1_INTH __riscv_vmv_v_x_i32m2
#define VLOAD1_INTHH __riscv_vmv_v_x_i32m1
#define VSTORE_INTH __riscv_vse32_v_i32m2
#define VADD_INTH __riscv_vadd_vv_i32m2
#define VADD1_INTH __riscv_vadd_vx_i32m2
#define VADD1_INTH_MASK __riscv_vadd_vx_i32m2_m
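// The >= 0.12 intrinsics dropped the explicit maskedoff argument from the
// plain _m variants; the helpers below emulate it by first merging maskedoff
// into the inactive lanes of op1, then applying the masked operation.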
static inline vint32m2_t VADD1_INTH_MASKEDOFF(vbool16_t mask, vint32m2_t op1, vint32m2_t maskedoff,
int32_t op2, size_t vl){
op1 = __riscv_vmerge_vvm_i32m2(maskedoff, op1, mask, vl);
return __riscv_vadd_vx_i32m2_m(mask, op1, op2, vl);
}
static inline vint64m2_t VADD1_INT64H_MASKEDOFF(vbool32_t mask, vint64m2_t op1, vint64m2_t maskedoff,
int64_t op2, size_t vl){
op1 = __riscv_vmerge_vvm_i64m2(maskedoff, op1, mask, vl);
return __riscv_vadd_vx_i64m2_m(mask, op1, op2, vl);
}
#define VMUL_INTH __riscv_vmul_vv_i32m2
#define VMUL1_INTH __riscv_vmul_vx_i32m2
#define VSUB_INTH __riscv_vsub_vv_i32m2
#define VSUB1_INTH __riscv_vsub_vx_i32m2
#define VSUB1_INTH_MASK __riscv_vsub_vx_i32m2_m
#define VAND1_INTH __riscv_vand_vx_i32m2
#define VAND_INTH __riscv_vand_vv_i32m2
#define VXOR_INTH __riscv_vxor_vv_i32m2
#define VSLL1_INTH __riscv_vsll_vx_i32m2
#define VEQ1_INTH_BOOLH __riscv_vmseq_vx_i32m2_b16
#define VGT1_INTH_BOOLH __riscv_vmsgt_vx_i32m2_b16
#define VNE1_INTH_BOOLH __riscv_vmsne_vx_i32m2_b16
#define VLT1_INTH_BOOLH __riscv_vmslt_vx_i32m2_b16
#define VLE1_INTH_BOOLH __riscv_vmsle_vx_i32m2_b16
#define VEQ_INTH_BOOLH __riscv_vmseq_vv_i32m2_b16
#define VOR1_INTH __riscv_vor_vx_i32m2
#define VSRA1_INTH __riscv_vsra_vx_i32m2
#define VMIN_INTH __riscv_vmin_vv_i32m2
#define VMIN1_INTH __riscv_vmin_vx_i32m2
#define VMAX_INTH __riscv_vmax_vv_i32m2
#define VMAX1_INTH __riscv_vmax_vx_i32m2
#define VNOT_INTH __riscv_vnot_v_i32m2
static inline vint32m2_t VMERGE_INTH(vbool16_t mask, vint32m2_t op1,
vint32m2_t op2, size_t vl){
return __riscv_vmerge_vvm_i32m2(op1, op2, mask, vl);
}
//// UINTH
#define VLOAD_UINTH __riscv_vle32_v_u32m2
#define V_ELT_UINTH vuint32m2_t
#define VCVT_FLOATH_UINTH __riscv_vfcvt_xu_f_v_u32m2
//// SHORTH
#define V_ELT_SHORTH vint16m2_t
#define VLOAD_SHORTH __riscv_vle16_v_i16m2
#define VLOAD1_SHORTH __riscv_vmv_v_x_i16m2
#define VSTORE_SHORTH __riscv_vse16_v_i16m2
#define VADD_SHORTH __riscv_vadd_vv_i16m2
#define VREDSUMW_SHORTH __riscv_vwredsum_vs_i16m2_i32m1_tu
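// The >= 0.12 narrowing clips take an explicit fixed-point rounding mode;
// the wrappers below pin it to __RISCV_VXRM_RNU (round-to-nearest-up) so the
// result no longer depends on the runtime vxrm state.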
static inline vint16m2_t VCVT_INT_SHORTH (vint32m4_t op1, size_t shift, size_t vl){
return __riscv_vnclip_wx_i16m2(op1, shift, __RISCV_VXRM_RNU, vl);
}
#define VCVT_SHORTH_INT __riscv_vsext_vf2_i32m4
//// USHORTH
#define V_ELT_USHORTH vuint16m2_t
#define VLOAD_USHORTH __riscv_vle16_v_u16m2
#define VSTORE_USHORTH __riscv_vse16_v_u16m2
static inline vuint16m2_t VCVT_UINT_USHORTH (vuint32m4_t op1, size_t shift, size_t vl){
return __riscv_vnclipu_wx_u16m2(op1, shift, __RISCV_VXRM_RNU, vl);
}
//// UBYTEHH
#define V_ELT_UBYTEHH vuint8m1_t
#define VLOAD_UBYTEHH __riscv_vle8_v_u8m1
#define VSTORE_UBYTEHH __riscv_vse8_v_u8m1
static inline vuint8m1_t VCVT_USHORTH_UBYTEHH (vuint16m2_t op1, size_t shift, size_t vl){
return __riscv_vnclipu_wx_u8m1(op1, shift, __RISCV_VXRM_RNU, vl);
}
//// BOOL for half-length vector, 32 bits elements
#define V_ELT_BOOL32H vbool16_t
#define VNOT_BOOLH __riscv_vmnot_m_b16
#define VCLEAR_BOOLH __riscv_vmclr_m_b16
#define VXOR_BOOLH __riscv_vmxor_mm_b16
#define VOR_BOOLH __riscv_vmor_mm_b16
#define VAND_BOOLH __riscv_vmand_mm_b16
#define VANDNOT_BOOLH __riscv_vmandn_mm_b16
//#define VANDNOT_BOOLH __riscv_vmnand_mm_b16
#if ELEN >= 64
#define VSETVL64 __riscv_vsetvl_e64m4
#define VSETVL64H __riscv_vsetvl_e64m2
//// DOUBLE
#define V_ELT_DOUBLE vfloat64m4_t
#define VLOAD_DOUBLE __riscv_vle64_v_f64m4
#define VLOAD1_DOUBLE __riscv_vfmv_v_f_f64m4
#define VSTORE_DOUBLE __riscv_vse64_v_f64m4
#define VADD_DOUBLE __riscv_vfadd_vv_f64m4
#define VADD1_DOUBLE __riscv_vfadd_vf_f64m4
#define VSUB_DOUBLE __riscv_vfsub_vv_f64m4
#define VSUB1_DOUBLE __riscv_vfsub_vf_f64m4
#define VMUL_DOUBLE __riscv_vfmul_vv_f64m4
#define VMUL1_DOUBLE __riscv_vfmul_vf_f64m4
#define VDIV_DOUBLE __riscv_vfdiv_vv_f64m4
#define VFMA_DOUBLE __riscv_vfmacc_vv_f64m4 // d = a + b*c
#define VFMA1_DOUBLE __riscv_vfmacc_vf_f64m4
#define VFMSUB_DOUBLE __riscv_vfmsub_vv_f64m4 // d = a*b - c
#define VREDSUM_DOUBLE __riscv_vfredosum_vs_f64m4_f64m1_tu
#define VREDMAX_DOUBLE __riscv_vfredmax_vs_f64m4_f64m1_tu
#define VREDMIN_DOUBLE __riscv_vfredmin_vs_f64m4_f64m1_tu
#define VMIN_DOUBLE __riscv_vfmin_vv_f64m4
#define VMAX_DOUBLE __riscv_vfmax_vv_f64m4
#define VMIN1_DOUBLE __riscv_vfmin_vf_f64m4
#define VMAX1_DOUBLE __riscv_vfmax_vf_f64m4
#define VINTERP_DOUBLE_INT __riscv_vreinterpret_v_f64m4_i64m4
#define VINTERP_INT_DOUBLE __riscv_vreinterpret_v_i64m4_f64m4
#define VCVT_RTZ_DOUBLE_INT __riscv_vfcvt_rtz_x_f_v_i64m4
#define VCVT_DOUBLE_INT __riscv_vfcvt_x_f_v_i64m4
#define VCVT_INT_DOUBLE __riscv_vfcvt_f_x_v_f64m4
#define VABS_DOUBLE __riscv_vfabs_v_f64m4
#define VSQRT_DOUBLE __riscv_vfsqrt_v_f64m4
#define VCVT_DOUBLE_FLOAT __riscv_vfncvt_f_f_w_f32m2
#define VCVT_FLOAT_DOUBLE __riscv_vfwcvt_f_f_v_f64m4
//// DOUBLEH
#define V_ELT_DOUBLEH vfloat64m2_t
static inline void VLOAD_DOUBLEH2(vfloat64m2_t *v0, vfloat64m2_t *v1, double* base, size_t vl){
vfloat64m2x2_t v = __riscv_vlseg2e64_v_f64m2x2(base, vl);
*v0 = __riscv_vget_v_f64m2x2_f64m2(v, 0);
*v1 = __riscv_vget_v_f64m2x2_f64m2(v, 1);
}
static inline void VSTORE_DOUBLEH2(double* base, vfloat64m2_t v0, vfloat64m2_t v1, size_t vl){
vfloat64m2x2_t v = __riscv_vcreate_v_f64m2x2(v0, v1);
__riscv_vsseg2e64_v_f64m2x2(base, v, vl);
}
#define VLOAD_DOUBLEH_STRIDE __riscv_vlse64_v_f64m2
#define VLOAD_DOUBLEH __riscv_vle64_v_f64m2
#define VLOAD1_DOUBLEH __riscv_vfmv_v_f_f64m2
#define VSTORE_DOUBLEH __riscv_vse64_v_f64m2
#define VADD_DOUBLEH __riscv_vfadd_vv_f64m2
#define VADD1_DOUBLEH __riscv_vfadd_vf_f64m2
#define VSUB_DOUBLEH __riscv_vfsub_vv_f64m2
#define VSUB1_DOUBLEH __riscv_vfsub_vf_f64m2
#define VRSUB1_DOUBLEH __riscv_vfrsub_vf_f64m2 // v2 = f - v1
#define VMUL_DOUBLEH __riscv_vfmul_vv_f64m2
#define VMUL1_DOUBLEH __riscv_vfmul_vf_f64m2
#define VDIV_DOUBLEH __riscv_vfdiv_vv_f64m2
#define VFMACC_DOUBLEH __riscv_vfmacc_vv_f64m2 // d = a + b*c
#define VFMACC1_DOUBLEH __riscv_vfmacc_vf_f64m2
#define VFMADD_DOUBLEH __riscv_vfmadd_vv_f64m2
#define VFMADD1_DOUBLEH __riscv_vfmadd_vf_f64m2
#define VFMA1_DOUBLEH __riscv_vfmacc_vf_f64m2
#define VFMSUB_DOUBLEH __riscv_vfmsub_vv_f64m2 // d = a*b - c
#define VREDSUM_DOUBLEH __riscv_vfredosum_vs_f64m2_f64m1_tu
#define VREDMAX_DOUBLEH __riscv_vfredmax_vs_f64m2_f64m1_tu
#define VREDMIN_DOUBLEH __riscv_vfredmin_vs_f64m2_f64m1_tu
#define VMIN_DOUBLEH __riscv_vfmin_vv_f64m2
#define VMAX_DOUBLEH __riscv_vfmax_vv_f64m2
#define VMIN1_DOUBLEH __riscv_vfmin_vf_f64m2
#define VMAX1_DOUBLEH __riscv_vfmax_vf_f64m2
#define VINTERP_DOUBLEH_INTH __riscv_vreinterpret_v_f64m2_i64m2
#define VINTERP_INTH_DOUBLEH __riscv_vreinterpret_v_i64m2_f64m2
#define VCVT_RTZ_DOUBLEH_INTH __riscv_vfcvt_rtz_x_f_v_i64m2
#define VCVT_DOUBLEH_INTH __riscv_vfcvt_x_f_v_i64m2
#define VCVT_INTH_DOUBLEH __riscv_vfcvt_f_x_v_f64m2
#define VABS_DOUBLEH __riscv_vfabs_v_f64m2
#define VSQRT_DOUBLEH __riscv_vfsqrt_v_f64m2
#define VCVT_DOUBLEH_FLOATH __riscv_vfncvt_f_f_w_f32m2
#define VCVT_FLOATH_DOUBLEH __riscv_vfwcvt_f_f_v_f64m2
static inline vfloat64m2_t VMERGE_DOUBLEH(vbool32_t mask, vfloat64m2_t op1,
vfloat64m2_t op2, size_t vl){
return __riscv_vmerge_vvm_f64m2(op1, op2, mask, vl);
}
static inline vfloat64m2_t VMERGE1_DOUBLEH(vbool32_t mask, vfloat64m2_t op1,
double op2, size_t vl){
return __riscv_vfmerge_vfm_f64m2(op1, op2, mask, vl);
}
#define VEQ1_DOUBLEH_BOOLH __riscv_vmfeq_vf_f64m2_b32
#define VEQ_DOUBLEH_BOOLH __riscv_vmfeq_vv_f64m2_b32
#define VGE1_DOUBLEH_BOOLH __riscv_vmfge_vf_f64m2_b32
#define VGT1_DOUBLEH_BOOLH __riscv_vmfgt_vf_f64m2_b32
#define VNE1_DOUBLEH_BOOLH __riscv_vmfne_vf_f64m2_b32
#define VLT1_DOUBLEH_BOOLH __riscv_vmflt_vf_f64m2_b32
#define VLE1_DOUBLEH_BOOLH __riscv_vmfle_vf_f64m2_b32
#define VADD1_DOUBLEH_MASK __riscv_vfadd_vf_f64m2_m
#define VMUL1_DOUBLEH_MASK __riscv_vfmul_vf_f64m2_m
static inline vfloat64m2_t VADD1_DOUBLEH_MASKEDOFF(vbool32_t mask, vfloat64m2_t op1, vfloat64m2_t maskedoff,
double op2, size_t vl){
op1 = __riscv_vmerge_vvm_f64m2(maskedoff, op1, mask, vl);
return __riscv_vfadd_vf_f64m2_m(mask, op1, op2, vl);
}
// INT64H
#define V_ELT_INT64H vint64m2_t
#define VLOAD1_INT64H __riscv_vmv_v_x_i64m2
#define VSLL1_INT64H __riscv_vsll_vx_i64m2
#define VADD1_INT64H __riscv_vadd_vx_i64m2
#define VAND1_INT64H __riscv_vand_vx_i64m2
#define VXOR_INT64H __riscv_vxor_vv_i64m2
#define VNE1_INTH_BOOL64H __riscv_vmsne_vx_i64m2_b32
#define VEQ1_INTH_BOOL64H __riscv_vmseq_vx_i64m2_b32
#define VGT1_INTH_BOOL64H __riscv_vmsgt_vx_i64m2_b32
#define VSUB1_INT64H_MASK __riscv_vsub_vx_i64m2_m
static inline vint64m2_t VMERGE_INT64H(vbool32_t mask, vint64m2_t op1,
vint64m2_t op2, size_t vl){
return __riscv_vmerge_vvm_i64m2(op1, op2, mask, vl);
}
static inline vint64m2_t VMERGE1_INT64H(vbool32_t mask, vint64m2_t op1,
int64_t op2, size_t vl){
return __riscv_vmerge_vxm_i64m2(op1, op2, mask, vl);
}
#define VADD1_INT64H_MASK __riscv_vadd_vx_i64m2_m
#define VSTORE_INT64H __riscv_vse64_v_i64m2
//BOOL64H
#define V_ELT_BOOL64H vbool32_t
#define VNOT_BOOL64H __riscv_vmnot_m_b32
#define VCLEAR_BOOL64H __riscv_vmclr_m_b32
#define VXOR_BOOL64H __riscv_vmxor_mm_b32
#define VOR_BOOL64H __riscv_vmor_mm_b32
#define VAND_BOOL64H __riscv_vmand_mm_b32
#define VANDNOT_BOOL64H __riscv_vmandn_mm_b32
#endif // ELEN >= 64
#endif
#endif // RISCV
#ifdef ALTIVEC
#include <altivec.h>
#endif
#ifdef _MSC_VER /* visual c++ */
#define ALIGN16_BEG __declspec(align(16))
#define ALIGN16_END
#define ALIGN32_BEG __declspec(align(32))
#define ALIGN32_END
#define ALIGN64_BEG __declspec(align(64))
#define ALIGN64_END
#else /* gcc,icc, clang */
#define ALIGN16_BEG
#define ALIGN16_END __attribute__((aligned(16)))
#define ALIGN32_BEG
#define ALIGN32_END __attribute__((aligned(32)))
#define ALIGN64_BEG
#define ALIGN64_END __attribute__((aligned(64)))
#endif
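/* Usage sketch (illustrative): the BEG/END pair lets a single declaration
   carry the right alignment attribute on both compiler families:
       ALIGN32_BEG float coeffs[8] ALIGN32_END = {0};
*/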
static const float FOPI = 1.27323954473516f;       // 4/pi
static const float PIO4F = 0.7853981633974483096f; // pi/4
static const double FOPId = 1.2732395447351626861510701069801148; // 4/pi