Hướng dẫn thay đổi AVX2 vpsllvd
và vpsrlvd
có thuộc tính đẹp thay đổi số lớn hơn hoặc bằng 32 dẫn đến số nguyên không trong sổ đăng ký ymm. Nói cách khác: số lần dịch chuyển không bị che khuất, ngược lại với số lần dịch chuyển cho các lệnh thay đổi vô hướng x86.
Do đó mã khá đơn giản:
/*
gcc -O3 -m64 -Wall -mavx2 -march=broadwell avx2_bit_mask.c
*/
#include <immintrin.h>
#include <stdio.h>
__m256i bit_mask_avx2_msb(unsigned int n)
{
__m256i ones = _mm256_set1_epi32(-1);
__m256i cnst32_256 = _mm256_set_epi32(32,64,96,128, 160,192,224,256);
__m256i shift = _mm256_set1_epi32(n);
shift = _mm256_subs_epu16(cnst32_256,shift);
return _mm256_sllv_epi32(ones,shift);
}
__m256i bit_mask_avx2_lsb(unsigned int n)
{
__m256i ones = _mm256_set1_epi32(-1);
__m256i cnst32_256 = _mm256_set_epi32(256,224,192,160, 128,96,64,32);
__m256i shift = _mm256_set1_epi32(n);
shift = _mm256_subs_epu16(cnst32_256,shift);
return _mm256_srlv_epi32(ones,shift);
}
int print_avx2_hex(__m256i ymm)
{
long unsigned int x[4];
_mm256_storeu_si256((__m256i*)x,ymm);
printf("%016lX %016lX %016lX %016lX\n", x[3],x[2],x[1],x[0]);
return 0;
}
int main()
{
unsigned int i;
for (i=0;i<259;i++){
printf("bit_mask_avx2_lsb(%3d) ",i);
print_avx2_hex(bit_mask_avx2_lsb(i));
}
printf("\n");
for (i=0;i<259;i++){
printf("bit_mask_avx2_msb(%3d) ",i);
print_avx2_hex(bit_mask_avx2_msb(i));
}
printf("\n");
return 0;
}
Kết quả là:
$ ./a.out
bit_mask_avx2_lsb( 0) 0000000000000000 0000000000000000 0000000000000000 0000000000000000
bit_mask_avx2_lsb( 1) 0000000000000000 0000000000000000 0000000000000000 0000000000000001
bit_mask_avx2_lsb( 2) 0000000000000000 0000000000000000 0000000000000000 0000000000000003
bit_mask_avx2_lsb( 3) 0000000000000000 0000000000000000 0000000000000000 0000000000000007
bit_mask_avx2_lsb( 4) 0000000000000000 0000000000000000 0000000000000000 000000000000000F
bit_mask_avx2_lsb( 5) 0000000000000000 0000000000000000 0000000000000000 000000000000001F
bit_mask_avx2_lsb( 6) 0000000000000000 0000000000000000 0000000000000000 000000000000003F
bit_mask_avx2_lsb( 7) 0000000000000000 0000000000000000 0000000000000000 000000000000007F
bit_mask_avx2_lsb( 8) 0000000000000000 0000000000000000 0000000000000000 00000000000000FF
bit_mask_avx2_lsb( 9) 0000000000000000 0000000000000000 0000000000000000 00000000000001FF
bit_mask_avx2_lsb(10) 0000000000000000 0000000000000000 0000000000000000 00000000000003FF
bit_mask_avx2_lsb(11) 0000000000000000 0000000000000000 0000000000000000 00000000000007FF
...
bit_mask_avx2_lsb(124) 0000000000000000 0000000000000000 0FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
bit_mask_avx2_lsb(125) 0000000000000000 0000000000000000 1FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
bit_mask_avx2_lsb(126) 0000000000000000 0000000000000000 3FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
bit_mask_avx2_lsb(127) 0000000000000000 0000000000000000 7FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
bit_mask_avx2_lsb(128) 0000000000000000 0000000000000000 FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
bit_mask_avx2_lsb(129) 0000000000000000 0000000000000001 FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
bit_mask_avx2_lsb(130) 0000000000000000 0000000000000003 FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
bit_mask_avx2_lsb(131) 0000000000000000 0000000000000007 FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
bit_mask_avx2_lsb(132) 0000000000000000 000000000000000F FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
...
bit_mask_avx2_lsb(248) 00FFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
bit_mask_avx2_lsb(249) 01FFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
bit_mask_avx2_lsb(250) 03FFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
bit_mask_avx2_lsb(251) 07FFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
bit_mask_avx2_lsb(252) 0FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
bit_mask_avx2_lsb(253) 1FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
bit_mask_avx2_lsb(254) 3FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
bit_mask_avx2_lsb(255) 7FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
bit_mask_avx2_lsb(256) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
bit_mask_avx2_lsb(257) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
bit_mask_avx2_lsb(258) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
bit_mask_avx2_msb( 0) 0000000000000000 0000000000000000 0000000000000000 0000000000000000
bit_mask_avx2_msb( 1) 8000000000000000 0000000000000000 0000000000000000 0000000000000000
bit_mask_avx2_msb( 2) C000000000000000 0000000000000000 0000000000000000 0000000000000000
bit_mask_avx2_msb( 3) E000000000000000 0000000000000000 0000000000000000 0000000000000000
bit_mask_avx2_msb( 4) F000000000000000 0000000000000000 0000000000000000 0000000000000000
bit_mask_avx2_msb( 5) F800000000000000 0000000000000000 0000000000000000 0000000000000000
bit_mask_avx2_msb( 6) FC00000000000000 0000000000000000 0000000000000000 0000000000000000
bit_mask_avx2_msb( 7) FE00000000000000 0000000000000000 0000000000000000 0000000000000000
bit_mask_avx2_msb( 8) FF00000000000000 0000000000000000 0000000000000000 0000000000000000
bit_mask_avx2_msb( 9) FF80000000000000 0000000000000000 0000000000000000 0000000000000000
bit_mask_avx2_msb(10) FFC0000000000000 0000000000000000 0000000000000000 0000000000000000
bit_mask_avx2_msb(11) FFE0000000000000 0000000000000000 0000000000000000 0000000000000000
...
bit_mask_avx2_msb(124) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFF0 0000000000000000 0000000000000000
bit_mask_avx2_msb(125) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFF8 0000000000000000 0000000000000000
bit_mask_avx2_msb(126) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFC 0000000000000000 0000000000000000
bit_mask_avx2_msb(127) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFE 0000000000000000 0000000000000000
bit_mask_avx2_msb(128) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF 0000000000000000 0000000000000000
bit_mask_avx2_msb(129) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF 8000000000000000 0000000000000000
bit_mask_avx2_msb(130) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF C000000000000000 0000000000000000
bit_mask_avx2_msb(131) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF E000000000000000 0000000000000000
bit_mask_avx2_msb(132) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF F000000000000000 0000000000000000
...
bit_mask_avx2_msb(248) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFF00
bit_mask_avx2_msb(249) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFF80
bit_mask_avx2_msb(250) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFC0
bit_mask_avx2_msb(251) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFE0
bit_mask_avx2_msb(252) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFF0
bit_mask_avx2_msb(253) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFF8
bit_mask_avx2_msb(254) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFC
bit_mask_avx2_msb(255) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFE
bit_mask_avx2_msb(256) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
bit_mask_avx2_msb(257) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
bit_mask_avx2_msb(258) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF
Đối với một giá trị n
, với 256 < = n
< = 65535, tất cả các bit được thiết lập để một, như người ta có thể mong đợi. Giới hạn trên của 65535 là do số học bão hòa 16 bit của _mm256_subs_epu16()
. Với n
= 65536 bitmask (giá trị đầu ra) bằng không. Có thể sửa đổi mã sao cho tất cả các bit được đặt thành một số cho phạm vi 256 < = n
< = INT_MAX
. này có thể đạt được bằng cách thay thế shift = _mm256_subs_epu16(cnst32_256,shift);
với
__m256i mask = _mm256_cmpgt_epi32(cnst32_256,shift);
shift = _mm256_sub_epi32(cnst32_256,shift);
shift = _mm256_and_si256(shift,mask);
Ba intrinsics nhiều hay ít thi đua _mm256_subs_epu32(cnst32_256,shift)
, mà không tồn tại.
Rất hay. Điều này chỉ đắt hơn một chút so với kỹ thuật tải không cửa sổ trượt không hoạt động tốt cho mặt nạ byte (không phải mặt nạ bit). Trong một số trường hợp thậm chí có thể thích hợp hơn. –