Miscellaneous Operations for Streaming SIMD Extensions 2

The miscellaneous intrinsics for Streaming SIMD Extensions 2 (SSE2) are listed in the following table followed by their descriptions.

The prototypes for SSE2 intrinsics are in the emmintrin.h header file.

Intrinsic	Corresponding Instruction	Operation
_mm_packs_epi16	PACKSSWB	Packed Saturation
_mm_packs_epi32	PACKSSDW	Packed Saturation
_mm_packus_epi16	PACKUSWB	Packed Saturation
_mm_extract_epi16	PEXTRW	Extraction
_mm_insert_epi16	PINSRW	Insertion
_mm_movemask_epi8	PMOVMSKB	Mask Creation
_mm_shuffle_epi32	PSHUFD	Shuffle
_mm_shufflehi_epi16	PSHUFHW	Shuffle
_mm_shufflelo_epi16	PSHUFLW	Shuffle
_mm_unpackhi_epi8	PUNPCKHBW	Interleave
_mm_unpackhi_epi16	PUNPCKHWD	Interleave
_mm_unpackhi_epi32	PUNPCKHDQ	Interleave
_mm_unpackhi_epi64	PUNPCKHQDQ	Interleave
_mm_unpacklo_epi8	PUNPCKLBW	Interleave
_mm_unpacklo_epi16	PUNPCKLWD	Interleave
_mm_unpacklo_epi32	PUNPCKLDQ	Interleave
_mm_unpacklo_epi64	PUNPCKLQDQ	Interleave
_mm_movepi64_pi64	MOVDQ2Q	Move
_mm_movpi64_epi64	MOVQ2DQ	Move
_mm_move_epi64	MOVQ	Move

__m128i _mm_packs_epi16(__m128i a, __m128i b)

Packs the 16 signed 16-bit integers from a and b into signed 8-bit integers and saturates.
r0 := SignedSaturate(a0)
r1 := SignedSaturate(a1)
...
r7 := SignedSaturate(a7)
r8 := SignedSaturate(b0)
r9 := SignedSaturate(b1)
...
r15 := SignedSaturate(b7)

__m128i _mm_packs_epi32(__m128i a, __m128i b)

Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers and saturates.
r0 := SignedSaturate(a0)
r1 := SignedSaturate(a1)
r2 := SignedSaturate(a2)
r3 := SignedSaturate(a3)
r4 := SignedSaturate(b0)
r5 := SignedSaturate(b1)
r6 := SignedSaturate(b2)
r7 := SignedSaturate(b3)

__m128i _mm_packus_epi16(__m128i a, __m128i b)

Packs the 16 signed 16-bit integers from a and b into 8-bit unsigned integers and saturates.
r0 := UnsignedSaturate(a0)
r1 := UnsignedSaturate(a1)
...
r7 := UnsignedSaturate(a7)
r8 := UnsignedSaturate(b0)
r9 := UnsignedSaturate(b1)
...
r15 := UnsignedSaturate(b7)

int _mm_extract_epi16(__m128i a, int imm)

Extracts the selected signed or unsigned 16-bit integer from a and zero extends. The selector imm must be an immediate.
r := (imm == 0) ? a0 :
( (imm == 1) ? a1 :
...
(imm == 7) ? a7 )

__m128i _mm_insert_epi16(__m128i a, int b, int imm)

Inserts the least significant 16 bits of b into the selected 16-bit integer of a. The selector imm must be an immediate.
r0 := (imm == 0) ? b : a0;
r1 := (imm == 1) ? b : a1;
...
r7 := (imm == 7) ? b : a7;

int _mm_movemask_epi8(__m128i a)

Creates a 16-bit mask from the most significant bits of the 16 signed or unsigned 8-bit integers in a and zero extends the upper bits.
r := a15[7] << 15 |
a14[7] << 14 |
...
a1[7] << 1 |
a0[7]

__m128i _mm_shuffle_epi32(__m128i a, int imm)

Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm. The shuffle value, imm, must be an immediate. See Macro Function for Shuffle for a description of shuffle semantics.

__m128i _mm_shufflehi_epi16(__m128i a, int imm)

Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified by imm. The shuffle value, imm, must be an immediate. See Macro Function for Shuffle for a description of shuffle semantics.

__m128i _mm_shufflelo_epi16(__m128i a, int imm)

Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified by imm. The shuffle value, imm, must be an immediate. See Macro Function for Shuffle for a description of shuffle semantics.

__m128i _mm_unpackhi_epi8(__m128i a, __m128i b)

Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper 8 signed or unsigned 8-bit integers in b.
r0 := a8 ; r1 := b8
r2 := a9 ; r3 := b9
...
r14 := a15 ; r15 := b15

__m128i _mm_unpackhi_epi16(__m128i a, __m128i b)

Interleaves the upper 4 signed or unsigned 16-bit integers in a with the upper 4 signed or unsigned 16-bit integers in b.
r0 := a4 ; r1 := b4
r2 := a5 ; r3 := b5
r4 := a6 ; r5 := b6
r6 := a7 ; r7 := b7

__m128i _mm_unpackhi_epi32(__m128i a, __m128i b)

Interleaves the upper 2 signed or unsigned 32-bit integers in a with the upper 2 signed or unsigned 32-bit integers in b.
r0 := a2 ; r1 := b2
r2 := a3 ; r3 := b3

__m128i _mm_unpackhi_epi64(__m128i a, __m128i b)

Interleaves the upper signed or unsigned 64-bit integer in a with the upper signed or unsigned 64-bit integer in b.
r0 := a1 ; r1 := b1

__m128i _mm_unpacklo_epi8(__m128i a, __m128i b)

Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower 8 signed or unsigned 8-bit integers in b.
r0 := a0 ; r1 := b0
r2 := a1 ; r3 := b1
...
r14 := a7 ; r15 := b7

__m128i _mm_unpacklo_epi16(__m128i a, __m128i b)

Interleaves the lower 4 signed or unsigned 16-bit integers in a with the lower 4 signed or unsigned 16-bit integers in b.
r0 := a0 ; r1 := b0
r2 := a1 ; r3 := b1
r4 := a2 ; r5 := b2
r6 := a3 ; r7 := b3

__m128i _mm_unpacklo_epi32(__m128i a, __m128i b)

Interleaves the lower 2 signed or unsigned 32-bit integers in a with the lower 2 signed or unsigned 32-bit integers in b.
r0 := a0 ; r1 := b0
r2 := a1 ; r3 := b1

__m128i _mm_unpacklo_epi64(__m128i a, __m128i b)

Interleaves the lower signed or unsigned 64-bit integer in a with the lower signed or unsigned 64-bit integer in b.
r0 := a0 ; r1 := b0

__m64 _mm_movepi64_pi64(__m128i a)

Returns the lower 64 bits of a as an __m64 type.
r0 := a0 ;

__m128i _mm_movpi64_epi64(__m64 a)

Moves the 64 bits of a to the lower 64 bits of the result, zeroing the upper bits.
r0 := a0 ; r1 := 0x0 ;

__m128i _mm_move_epi64(__m128i a)

Moves the lower 64 bits of a to the lower 64 bits of the result, zeroing the upper bits.
r0 := a0 ; r1 := 0x0 ;

Additional Miscellaneous Intrinsics

The prototypes for Streaming SIMD Extensions 2 (SSE2) intrinsics are in the emmintrin.h header file.

__m128d _mm_unpackhi_pd(__m128d a, __m128d b)

(uses UNPCKHPD) Interleaves the upper DP FP values of a and b.
r0 := a1
r1 := b1

__m128d _mm_unpacklo_pd(__m128d a, __m128d b)

(uses UNPCKLPD) Interleaves the lower DP FP values of a and b.
r0 := a0
r1 := b0

int _mm_movemask_pd(__m128d a)

(uses MOVMSKPD) Creates a two-bit mask from the sign bits of the two DP FP values of a.
r := sign(a1) << 1 | sign(a0)

__m128d _mm_shuffle_pd(__m128d a, __m128d b, int i)

(uses SHUFPD) Selects two specific DP FP values from a and b, based on the mask i. The mask must be an immediate. See Macro Function for Shuffle for a description of the shuffle semantics.

Intrinsics for Casting Support

This version of the Intel C++ Compiler supports casting between various SP, DP, and INT vector types. These intrinsics do not convert values; they just change the type.

extern __m128 _mm_castpd_ps(__m128d in);

extern __m128i _mm_castpd_si128(__m128d in);

extern __m128d _mm_castps_pd(__m128 in);

extern __m128i _mm_castps_si128(__m128 in);

extern __m128 _mm_castsi128_ps(__m128i in);

extern __m128d _mm_castsi128_pd(__m128i in);