whitespace fixes

This commit is contained in:
Wim Taymans 2009-08-20 10:56:20 +02:00
parent 3cc1278dcf
commit f09b51198f
8 changed files with 695 additions and 670 deletions

View file

@ -36,7 +36,7 @@
#if defined (__arm__) && defined (__linux__)
#define MAX_BUFFER 4096
#define MAX_BUFFER 4096
static char *
get_cpuinfo_line (char *cpuinfo, const char *tag) {
char *line, *end, *colon;
@ -106,20 +106,20 @@ void pa_cpu_init_arm (void) {
}
/* get the CPU features */
if ((line = get_cpuinfo_line (cpuinfo, "Features"))) {
char *state = NULL, *current;
char *state = NULL, *current;
while ((current = pa_split_spaces (line, &state))) {
if (!strcmp (current, "vfp"))
flags |= PA_CPU_ARM_VFP;
else if (!strcmp (current, "edsp"))
flags |= PA_CPU_ARM_EDSP;
else if (!strcmp (current, "neon"))
flags |= PA_CPU_ARM_NEON;
else if (!strcmp (current, "vfpv3"))
flags |= PA_CPU_ARM_VFPV3;
while ((current = pa_split_spaces (line, &state))) {
if (!strcmp (current, "vfp"))
flags |= PA_CPU_ARM_VFP;
else if (!strcmp (current, "edsp"))
flags |= PA_CPU_ARM_EDSP;
else if (!strcmp (current, "neon"))
flags |= PA_CPU_ARM_NEON;
else if (!strcmp (current, "vfpv3"))
flags |= PA_CPU_ARM_VFPV3;
free (current);
}
free (current);
}
}
free (cpuinfo);

View file

@ -34,14 +34,15 @@
static void
get_cpuid (uint32_t op, uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
__asm__ __volatile__ (
" push %%"PA_REG_b" \n\t"
" cpuid \n\t"
" mov %%ebx, %%esi \n\t"
" pop %%"PA_REG_b" \n\t"
__asm__ __volatile__ (
" push %%"PA_REG_b" \n\t"
" cpuid \n\t"
" mov %%ebx, %%esi \n\t"
" pop %%"PA_REG_b" \n\t"
: "=a" (*a), "=S" (*b), "=c" (*c), "=d" (*d)
: "0" (op));
: "=a" (*a), "=S" (*b), "=c" (*c), "=d" (*d)
: "0" (op)
);
}
#endif
@ -97,23 +98,23 @@ void pa_cpu_init_x86 (void) {
}
pa_log_info ("CPU flags: %s%s%s%s%s%s%s%s%s%s",
(flags & PA_CPU_X86_MMX) ? "MMX " : "",
(flags & PA_CPU_X86_SSE) ? "SSE " : "",
(flags & PA_CPU_X86_SSE2) ? "SSE2 " : "",
(flags & PA_CPU_X86_SSE3) ? "SSE3 " : "",
(flags & PA_CPU_X86_SSSE3) ? "SSSE3 " : "",
(flags & PA_CPU_X86_SSE4_1) ? "SSE4_1 " : "",
(flags & PA_CPU_X86_SSE4_2) ? "SSE4_2 " : "",
(flags & PA_CPU_X86_MMXEXT) ? "MMXEXT " : "",
(flags & PA_CPU_X86_3DNOW) ? "3DNOW " : "",
(flags & PA_CPU_X86_3DNOWEXT) ? "3DNOWEXT " : "");
(flags & PA_CPU_X86_MMX) ? "MMX " : "",
(flags & PA_CPU_X86_SSE) ? "SSE " : "",
(flags & PA_CPU_X86_SSE2) ? "SSE2 " : "",
(flags & PA_CPU_X86_SSE3) ? "SSE3 " : "",
(flags & PA_CPU_X86_SSSE3) ? "SSSE3 " : "",
(flags & PA_CPU_X86_SSE4_1) ? "SSE4_1 " : "",
(flags & PA_CPU_X86_SSE4_2) ? "SSE4_2 " : "",
(flags & PA_CPU_X86_MMXEXT) ? "MMXEXT " : "",
(flags & PA_CPU_X86_3DNOW) ? "3DNOW " : "",
(flags & PA_CPU_X86_3DNOWEXT) ? "3DNOWEXT " : "");
/* activate various optimisations */
if (flags & PA_CPU_X86_MMX) {
if (flags & PA_CPU_X86_MMX)
pa_volume_func_init_mmx (flags);
}
if (flags & PA_CPU_X86_SSE) {
pa_volume_func_init_sse (flags);
}
if (flags & PA_CPU_X86_SSE)
pa_volume_func_init_sse (flags);
#endif /* defined (__i386__) || defined (__amd64__) */
}

View file

@ -1065,30 +1065,53 @@ static pa_memchunk* convert_to_work_format(pa_resampler *r, pa_memchunk *input)
}
static void remap_mono_to_stereo(pa_resampler *r, void *dst, const void *src, unsigned n) {
unsigned i;
switch (r->work_format) {
case PA_SAMPLE_FLOAT32NE:
{
float *d, *s;
d = (float *) dst;
s = (float *) src;
d = (float *) dst;
s = (float *) src;
for (; n > 0; n--, s++, d += 2)
d[0] = d[1] = *s;
break;
}
for (i = n >> 2; i; i--) {
d[0] = d[1] = s[0];
d[2] = d[3] = s[1];
d[4] = d[5] = s[2];
d[6] = d[7] = s[3];
s += 4;
d += 8;
}
for (i = n & 3; i; i--) {
d[0] = d[1] = s[0];
s++;
d += 2;
}
break;
}
case PA_SAMPLE_S16NE:
{
int16_t *d, *s;
d = (int16_t *) dst;
s = (int16_t *) src;
d = (int16_t *) dst;
s = (int16_t *) src;
for (; n > 0; n--, s++, d += 2)
d[0] = d[1] = *s;
break;
}
for (i = n >> 2; i; i--) {
d[0] = d[1] = s[0];
d[2] = d[3] = s[1];
d[4] = d[5] = s[2];
d[6] = d[7] = s[3];
s += 4;
d += 8;
}
for (i = n & 3; i; i--) {
d[0] = d[1] = s[0];
s++;
d += 2;
}
break;
}
default:
pa_assert_not_reached();
}
@ -1114,7 +1137,7 @@ static void remap_channels_matrix (pa_resampler *r, void *dst, const void *src,
for (ic = 0; ic < n_ic; ic++) {
float vol;
vol = r->map_table_f[oc][ic];
vol = r->map_table_f[oc][ic];
if (vol <= 0.0)
continue;
@ -1122,18 +1145,18 @@ static void remap_channels_matrix (pa_resampler *r, void *dst, const void *src,
d = (float *)dst + oc;
s = (float *)src + ic;
if (vol >= 1.0) {
if (vol >= 1.0) {
for (i = n; i > 0; i--, s += n_ic, d += n_oc)
*d += *s;
} else {
} else {
for (i = n; i > 0; i--, s += n_ic, d += n_oc)
*d += *s * vol;
}
}
}
}
break;
}
}
case PA_SAMPLE_S16NE:
{
int16_t *d, *s;
@ -1144,7 +1167,7 @@ static void remap_channels_matrix (pa_resampler *r, void *dst, const void *src,
for (ic = 0; ic < n_ic; ic++) {
int32_t vol;
vol = r->map_table_i[oc][ic];
vol = r->map_table_i[oc][ic];
if (vol <= 0)
continue;
@ -1158,11 +1181,11 @@ static void remap_channels_matrix (pa_resampler *r, void *dst, const void *src,
} else {
for (i = n; i > 0; i--, s += n_ic, d += n_oc)
*d += (int16_t) (((int32_t)*s * vol) >> 16);
}
}
}
}
break;
}
}
default:
pa_assert_not_reached();
}

View file

@ -752,12 +752,13 @@ void pa_volume_memchunk(
return;
}
ptr = (uint8_t*) pa_memblock_acquire(c->memblock) + c->index;
do_volume = pa_get_volume_func (spec->format);
pa_assert(do_volume);
calc_volume_table[spec->format] ((void *)linear, volume);
ptr = (uint8_t*) pa_memblock_acquire(c->memblock) + c->index;
do_volume (ptr, (void *)linear, spec->channels, c->length);
pa_memblock_release(c->memblock);
@ -944,12 +945,12 @@ void pa_sample_clamp(pa_sample_format_t format, void *dst, size_t dstr, const vo
for (; n > 0; n--) {
float f;
f = *s;
f = *s;
*d = PA_CLAMP_UNLIKELY(f, -1.0f, 1.0f);
s = (const float*) ((const uint8_t*) s + sstr);
d = (float*) ((uint8_t*) d + dstr);
}
}
} else {
pa_assert(format == PA_SAMPLE_FLOAT32RE);

View file

@ -45,81 +45,81 @@
static void
pa_volume_s16ne_arm (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
int32_t *ve;
int32_t *ve;
channels = MAX (4, channels);
ve = volumes + channels;
channels = MAX (4, channels);
ve = volumes + channels;
__asm__ __volatile__ (
" mov r6, %1 \n\t"
" mov %3, %3, LSR #1 \n\t" /* length /= sizeof (int16_t) */
" tst %3, #1 \n\t" /* check for odd samples */
" beq 2f \n\t"
__asm__ __volatile__ (
" mov r6, %1 \n\t"
" mov %3, %3, LSR #1 \n\t" /* length /= sizeof (int16_t) */
" tst %3, #1 \n\t" /* check for odd samples */
" beq 2f \n\t"
"1: \n\t"
" ldr r0, [r6], #4 \n\t" /* odd samples volumes */
" ldrh r2, [%0] \n\t"
"1: \n\t"
" ldr r0, [r6], #4 \n\t" /* odd samples volumes */
" ldrh r2, [%0] \n\t"
" smulwb r0, r0, r2 \n\t"
" ssat r0, #16, r0 \n\t"
" smulwb r0, r0, r2 \n\t"
" ssat r0, #16, r0 \n\t"
" strh r0, [%0], #2 \n\t"
" strh r0, [%0], #2 \n\t"
MOD_INC()
MOD_INC()
"2: \n\t"
" mov %3, %3, LSR #1 \n\t"
" tst %3, #1 \n\t" /* check for odd samples */
" beq 4f \n\t"
"2: \n\t"
" mov %3, %3, LSR #1 \n\t"
" tst %3, #1 \n\t" /* check for odd samples */
" beq 4f \n\t"
"3: \n\t"
" ldrd r2, [r6], #8 \n\t" /* 2 samples at a time */
" ldr r0, [%0] \n\t"
"3: \n\t"
" ldrd r2, [r6], #8 \n\t" /* 2 samples at a time */
" ldr r0, [%0] \n\t"
" smulwt r2, r2, r0 \n\t"
" smulwb r3, r3, r0 \n\t"
" smulwt r2, r2, r0 \n\t"
" smulwb r3, r3, r0 \n\t"
" ssat r2, #16, r2 \n\t"
" ssat r3, #16, r3 \n\t"
" ssat r2, #16, r2 \n\t"
" ssat r3, #16, r3 \n\t"
" pkhbt r0, r3, r2, LSL #16 \n\t"
" str r0, [%0], #4 \n\t"
" pkhbt r0, r3, r2, LSL #16 \n\t"
" str r0, [%0], #4 \n\t"
MOD_INC()
MOD_INC()
"4: \n\t"
" movs %3, %3, LSR #1 \n\t"
" beq 6f \n\t"
"4: \n\t"
" movs %3, %3, LSR #1 \n\t"
" beq 6f \n\t"
"5: \n\t"
" ldrd r2, [r6], #8 \n\t" /* 4 samples at a time */
" ldrd r4, [r6], #8 \n\t"
" ldrd r0, [%0] \n\t"
"5: \n\t"
" ldrd r2, [r6], #8 \n\t" /* 4 samples at a time */
" ldrd r4, [r6], #8 \n\t"
" ldrd r0, [%0] \n\t"
" smulwt r2, r2, r0 \n\t"
" smulwb r3, r3, r0 \n\t"
" smulwt r4, r4, r1 \n\t"
" smulwb r5, r5, r1 \n\t"
" smulwt r2, r2, r0 \n\t"
" smulwb r3, r3, r0 \n\t"
" smulwt r4, r4, r1 \n\t"
" smulwb r5, r5, r1 \n\t"
" ssat r2, #16, r2 \n\t"
" ssat r3, #16, r3 \n\t"
" ssat r4, #16, r4 \n\t"
" ssat r5, #16, r5 \n\t"
" ssat r2, #16, r2 \n\t"
" ssat r3, #16, r3 \n\t"
" ssat r4, #16, r4 \n\t"
" ssat r5, #16, r5 \n\t"
" pkhbt r0, r3, r2, LSL #16 \n\t"
" pkhbt r1, r5, r4, LSL #16 \n\t"
" strd r0, [%0], #8 \n\t"
" pkhbt r0, r3, r2, LSL #16 \n\t"
" pkhbt r1, r5, r4, LSL #16 \n\t"
" strd r0, [%0], #8 \n\t"
MOD_INC()
MOD_INC()
" subs %3, %3, #1 \n\t"
" bne 5b \n\t"
"6: \n\t"
" subs %3, %3, #1 \n\t"
" bne 5b \n\t"
"6: \n\t"
: "+r" (samples), "+r" (volumes), "+r" (ve), "+r" (length)
:
: "r6", "r5", "r4", "r3", "r2", "r1", "r0", "cc"
);
: "+r" (samples), "+r" (volumes), "+r" (ve), "+r" (length)
:
: "r6", "r5", "r4", "r3", "r2", "r1", "r0", "cc"
);
}
#undef RUN_TEST
@ -131,51 +131,51 @@ pa_volume_s16ne_arm (int16_t *samples, int32_t *volumes, unsigned channels, unsi
#define PADDING 16
static void run_test (void) {
int16_t samples[SAMPLES];
int16_t samples_ref[SAMPLES];
int16_t samples_orig[SAMPLES];
int32_t volumes[CHANNELS + PADDING];
int i, j, padding;
pa_do_volume_func_t func;
struct timeval start, stop;
int16_t samples[SAMPLES];
int16_t samples_ref[SAMPLES];
int16_t samples_orig[SAMPLES];
int32_t volumes[CHANNELS + PADDING];
int i, j, padding;
pa_do_volume_func_t func;
struct timeval start, stop;
func = pa_get_volume_func (PA_SAMPLE_S16NE);
func = pa_get_volume_func (PA_SAMPLE_S16NE);
printf ("checking ARM %zd\n", sizeof (samples));
printf ("checking ARM %zd\n", sizeof (samples));
pa_random (samples, sizeof (samples));
memcpy (samples_ref, samples, sizeof (samples));
memcpy (samples_orig, samples, sizeof (samples));
pa_random (samples, sizeof (samples));
memcpy (samples_ref, samples, sizeof (samples));
memcpy (samples_orig, samples, sizeof (samples));
for (i = 0; i < CHANNELS; i++)
volumes[i] = rand() >> 1;
for (padding = 0; padding < PADDING; padding++, i++)
volumes[i] = volumes[padding];
for (i = 0; i < CHANNELS; i++)
volumes[i] = rand() >> 1;
for (padding = 0; padding < PADDING; padding++, i++)
volumes[i] = volumes[padding];
func (samples_ref, volumes, CHANNELS, sizeof (samples));
pa_volume_s16ne_arm (samples, volumes, CHANNELS, sizeof (samples));
for (i = 0; i < SAMPLES; i++) {
if (samples[i] != samples_ref[i]) {
printf ("%d: %04x != %04x (%04x * %04x)\n", i, samples[i], samples_ref[i],
samples_orig[i], volumes[i % CHANNELS]);
}
}
pa_gettimeofday(&start);
for (j = 0; j < TIMES; j++) {
memcpy (samples, samples_orig, sizeof (samples));
pa_volume_s16ne_arm (samples, volumes, CHANNELS, sizeof (samples));
}
pa_gettimeofday(&stop);
pa_log_info("ARM: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start));
pa_gettimeofday(&start);
for (j = 0; j < TIMES; j++) {
memcpy (samples_ref, samples_orig, sizeof (samples));
func (samples_ref, volumes, CHANNELS, sizeof (samples));
}
pa_gettimeofday(&stop);
pa_log_info("ref: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start));
pa_volume_s16ne_arm (samples, volumes, CHANNELS, sizeof (samples));
for (i = 0; i < SAMPLES; i++) {
if (samples[i] != samples_ref[i]) {
printf ("%d: %04x != %04x (%04x * %04x)\n", i, samples[i], samples_ref[i],
samples_orig[i], volumes[i % CHANNELS]);
}
}
pa_gettimeofday(&start);
for (j = 0; j < TIMES; j++) {
memcpy (samples, samples_orig, sizeof (samples));
pa_volume_s16ne_arm (samples, volumes, CHANNELS, sizeof (samples));
}
pa_gettimeofday(&stop);
pa_log_info("ARM: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start));
pa_gettimeofday(&start);
for (j = 0; j < TIMES; j++) {
memcpy (samples_ref, samples_orig, sizeof (samples));
func (samples_ref, volumes, CHANNELS, sizeof (samples));
}
pa_gettimeofday(&stop);
pa_log_info("ref: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start));
}
#endif
@ -184,12 +184,12 @@ static void run_test (void) {
void pa_volume_func_init_arm (pa_cpu_arm_flag_t flags) {
#if defined (__arm__)
pa_log_info("Initialising ARM optimized functions.");
pa_log_info("Initialising ARM optimized functions.");
#ifdef RUN_TEST
run_test ();
run_test ();
#endif
pa_set_volume_func (PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_arm);
pa_set_volume_func (PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_arm);
#endif /* defined (__arm__) */
}

View file

@ -35,289 +35,289 @@
static void
pa_volume_u8_c (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
unsigned channel;
unsigned channel;
for (channel = 0; length; length--) {
int32_t t, hi, lo;
for (channel = 0; length; length--) {
int32_t t, hi, lo;
hi = volumes[channel] >> 16;
lo = volumes[channel] & 0xFFFF;
hi = volumes[channel] >> 16;
lo = volumes[channel] & 0xFFFF;
t = (int32_t) *samples - 0x80;
t = ((t * lo) >> 16) + (t * hi);
t = PA_CLAMP_UNLIKELY(t, -0x80, 0x7F);
*samples++ = (uint8_t) (t + 0x80);
t = (int32_t) *samples - 0x80;
t = ((t * lo) >> 16) + (t * hi);
t = PA_CLAMP_UNLIKELY(t, -0x80, 0x7F);
*samples++ = (uint8_t) (t + 0x80);
if (PA_UNLIKELY(++channel >= channels))
channel = 0;
}
if (PA_UNLIKELY(++channel >= channels))
channel = 0;
}
}
static void
pa_volume_alaw_c (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
unsigned channel;
unsigned channel;
for (channel = 0; length; length--) {
int32_t t, hi, lo;
for (channel = 0; length; length--) {
int32_t t, hi, lo;
hi = volumes[channel] >> 16;
lo = volumes[channel] & 0xFFFF;
hi = volumes[channel] >> 16;
lo = volumes[channel] & 0xFFFF;
t = (int32_t) st_alaw2linear16(*samples);
t = ((t * lo) >> 16) + (t * hi);
t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
*samples++ = (uint8_t) st_13linear2alaw((int16_t) t >> 3);
t = (int32_t) st_alaw2linear16(*samples);
t = ((t * lo) >> 16) + (t * hi);
t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
*samples++ = (uint8_t) st_13linear2alaw((int16_t) t >> 3);
if (PA_UNLIKELY(++channel >= channels))
channel = 0;
}
if (PA_UNLIKELY(++channel >= channels))
channel = 0;
}
}
static void
pa_volume_ulaw_c (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
unsigned channel;
unsigned channel;
for (channel = 0; length; length--) {
int32_t t, hi, lo;
for (channel = 0; length; length--) {
int32_t t, hi, lo;
hi = volumes[channel] >> 16;
lo = volumes[channel] & 0xFFFF;
hi = volumes[channel] >> 16;
lo = volumes[channel] & 0xFFFF;
t = (int32_t) st_ulaw2linear16(*samples);
t = ((t * lo) >> 16) + (t * hi);
t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
*samples++ = (uint8_t) st_14linear2ulaw((int16_t) t >> 2);
t = (int32_t) st_ulaw2linear16(*samples);
t = ((t * lo) >> 16) + (t * hi);
t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
*samples++ = (uint8_t) st_14linear2ulaw((int16_t) t >> 2);
if (PA_UNLIKELY(++channel >= channels))
channel = 0;
}
if (PA_UNLIKELY(++channel >= channels))
channel = 0;
}
}
static void
pa_volume_s16ne_c (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
unsigned channel;
unsigned channel;
length /= sizeof (int16_t);
length /= sizeof (int16_t);
for (channel = 0; length; length--) {
int32_t t, hi, lo;
for (channel = 0; length; length--) {
int32_t t, hi, lo;
/* Multiplying the 32bit volume factor with the 16bit
* sample might result in an 48bit value. We want to
* do without 64 bit integers and hence do the
* multiplication independantly for the HI and LO part
* of the volume. */
/* Multiplying the 32bit volume factor with the 16bit
* sample might result in an 48bit value. We want to
* do without 64 bit integers and hence do the
* multiplication independantly for the HI and LO part
* of the volume. */
hi = volumes[channel] >> 16;
lo = volumes[channel] & 0xFFFF;
hi = volumes[channel] >> 16;
lo = volumes[channel] & 0xFFFF;
t = (int32_t)(*samples);
t = ((t * lo) >> 16) + (t * hi);
t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
*samples++ = (int16_t) t;
t = (int32_t)(*samples);
t = ((t * lo) >> 16) + (t * hi);
t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
*samples++ = (int16_t) t;
if (PA_UNLIKELY(++channel >= channels))
channel = 0;
}
if (PA_UNLIKELY(++channel >= channels))
channel = 0;
}
}
static void
pa_volume_s16re_c (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
unsigned channel;
unsigned channel;
length /= sizeof (int16_t);
length /= sizeof (int16_t);
for (channel = 0; length; length--) {
int32_t t, hi, lo;
for (channel = 0; length; length--) {
int32_t t, hi, lo;
hi = volumes[channel] >> 16;
lo = volumes[channel] & 0xFFFF;
hi = volumes[channel] >> 16;
lo = volumes[channel] & 0xFFFF;
t = (int32_t) PA_INT16_SWAP(*samples);
t = ((t * lo) >> 16) + (t * hi);
t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
*samples++ = PA_INT16_SWAP((int16_t) t);
t = (int32_t) PA_INT16_SWAP(*samples);
t = ((t * lo) >> 16) + (t * hi);
t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
*samples++ = PA_INT16_SWAP((int16_t) t);
if (PA_UNLIKELY(++channel >= channels))
channel = 0;
}
if (PA_UNLIKELY(++channel >= channels))
channel = 0;
}
}
static void
pa_volume_float32ne_c (float *samples, float *volumes, unsigned channels, unsigned length)
{
unsigned channel;
unsigned channel;
length /= sizeof (float);
length /= sizeof (float);
for (channel = 0; length; length--) {
*samples++ *= volumes[channel];
for (channel = 0; length; length--) {
*samples++ *= volumes[channel];
if (PA_UNLIKELY(++channel >= channels))
channel = 0;
}
if (PA_UNLIKELY(++channel >= channels))
channel = 0;
}
}
static void
pa_volume_float32re_c (float *samples, float *volumes, unsigned channels, unsigned length)
{
unsigned channel;
unsigned channel;
length /= sizeof (float);
length /= sizeof (float);
for (channel = 0; length; length--) {
float t;
for (channel = 0; length; length--) {
float t;
t = PA_FLOAT32_SWAP(*samples);
t *= volumes[channel];
*samples++ = PA_FLOAT32_SWAP(t);
t = PA_FLOAT32_SWAP(*samples);
t *= volumes[channel];
*samples++ = PA_FLOAT32_SWAP(t);
if (PA_UNLIKELY(++channel >= channels))
channel = 0;
}
if (PA_UNLIKELY(++channel >= channels))
channel = 0;
}
}
static void
pa_volume_s32ne_c (int32_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
unsigned channel;
unsigned channel;
length /= sizeof (int32_t);
length /= sizeof (int32_t);
for (channel = 0; length; length--) {
int64_t t;
for (channel = 0; length; length--) {
int64_t t;
t = (int64_t)(*samples);
t = (t * volumes[channel]) >> 16;
t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
*samples++ = (int32_t) t;
t = (int64_t)(*samples);
t = (t * volumes[channel]) >> 16;
t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
*samples++ = (int32_t) t;
if (PA_UNLIKELY(++channel >= channels))
channel = 0;
}
if (PA_UNLIKELY(++channel >= channels))
channel = 0;
}
}
static void
pa_volume_s32re_c (int32_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
unsigned channel;
unsigned channel;
length /= sizeof (int32_t);
length /= sizeof (int32_t);
for (channel = 0; length; length--) {
int64_t t;
for (channel = 0; length; length--) {
int64_t t;
t = (int64_t) PA_INT32_SWAP(*samples);
t = (t * volumes[channel]) >> 16;
t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
*samples++ = PA_INT32_SWAP((int32_t) t);
t = (int64_t) PA_INT32_SWAP(*samples);
t = (t * volumes[channel]) >> 16;
t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
*samples++ = PA_INT32_SWAP((int32_t) t);
if (PA_UNLIKELY(++channel >= channels))
channel = 0;
}
if (PA_UNLIKELY(++channel >= channels))
channel = 0;
}
}
static void
pa_volume_s24ne_c (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
unsigned channel;
uint8_t *e;
unsigned channel;
uint8_t *e;
e = samples + length;
e = samples + length;
for (channel = 0; samples < e; samples += 3) {
int64_t t;
for (channel = 0; samples < e; samples += 3) {
int64_t t;
t = (int64_t)((int32_t) (PA_READ24NE(samples) << 8));
t = (t * volumes[channel]) >> 16;
t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
PA_WRITE24NE(samples, ((uint32_t) (int32_t) t) >> 8);
t = (int64_t)((int32_t) (PA_READ24NE(samples) << 8));
t = (t * volumes[channel]) >> 16;
t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
PA_WRITE24NE(samples, ((uint32_t) (int32_t) t) >> 8);
if (PA_UNLIKELY(++channel >= channels))
channel = 0;
}
if (PA_UNLIKELY(++channel >= channels))
channel = 0;
}
}
static void
pa_volume_s24re_c (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
unsigned channel;
uint8_t *e;
unsigned channel;
uint8_t *e;
e = samples + length;
e = samples + length;
for (channel = 0; samples < e; samples += 3) {
int64_t t;
for (channel = 0; samples < e; samples += 3) {
int64_t t;
t = (int64_t)((int32_t) (PA_READ24RE(samples) << 8));
t = (t * volumes[channel]) >> 16;
t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
PA_WRITE24RE(samples, ((uint32_t) (int32_t) t) >> 8);
t = (int64_t)((int32_t) (PA_READ24RE(samples) << 8));
t = (t * volumes[channel]) >> 16;
t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
PA_WRITE24RE(samples, ((uint32_t) (int32_t) t) >> 8);
if (PA_UNLIKELY(++channel >= channels))
channel = 0;
}
if (PA_UNLIKELY(++channel >= channels))
channel = 0;
}
}
static void
pa_volume_s24_32ne_c (uint32_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
unsigned channel;
unsigned channel;
length /= sizeof (uint32_t);
length /= sizeof (uint32_t);
for (channel = 0; length; length--) {
int64_t t;
for (channel = 0; length; length--) {
int64_t t;
t = (int64_t) ((int32_t) (*samples << 8));
t = (t * volumes[channel]) >> 16;
t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
*samples++ = ((uint32_t) ((int32_t) t)) >> 8;
t = (int64_t) ((int32_t) (*samples << 8));
t = (t * volumes[channel]) >> 16;
t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
*samples++ = ((uint32_t) ((int32_t) t)) >> 8;
if (PA_UNLIKELY(++channel >= channels))
channel = 0;
}
if (PA_UNLIKELY(++channel >= channels))
channel = 0;
}
}
static void
pa_volume_s24_32re_c (uint32_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
unsigned channel;
unsigned channel;
length /= sizeof (uint32_t);
length /= sizeof (uint32_t);
for (channel = 0; length; length--) {
int64_t t;
for (channel = 0; length; length--) {
int64_t t;
t = (int64_t) ((int32_t) (PA_UINT32_SWAP(*samples) << 8));
t = (t * volumes[channel]) >> 16;
t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
*samples++ = PA_UINT32_SWAP(((uint32_t) ((int32_t) t)) >> 8);
t = (int64_t) ((int32_t) (PA_UINT32_SWAP(*samples) << 8));
t = (t * volumes[channel]) >> 16;
t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
*samples++ = PA_UINT32_SWAP(((uint32_t) ((int32_t) t)) >> 8);
if (PA_UNLIKELY(++channel >= channels))
channel = 0;
}
if (PA_UNLIKELY(++channel >= channels))
channel = 0;
}
}
static pa_do_volume_func_t do_volume_table[] =
{
[PA_SAMPLE_U8] = (pa_do_volume_func_t) pa_volume_u8_c,
[PA_SAMPLE_ALAW] = (pa_do_volume_func_t) pa_volume_alaw_c,
[PA_SAMPLE_ULAW] = (pa_do_volume_func_t) pa_volume_ulaw_c,
[PA_SAMPLE_S16NE] = (pa_do_volume_func_t) pa_volume_s16ne_c,
[PA_SAMPLE_S16RE] = (pa_do_volume_func_t) pa_volume_s16re_c,
[PA_SAMPLE_FLOAT32NE] = (pa_do_volume_func_t) pa_volume_float32ne_c,
[PA_SAMPLE_FLOAT32RE] = (pa_do_volume_func_t) pa_volume_float32re_c,
[PA_SAMPLE_S32NE] = (pa_do_volume_func_t) pa_volume_s32ne_c,
[PA_SAMPLE_S32RE] = (pa_do_volume_func_t) pa_volume_s32re_c,
[PA_SAMPLE_S24NE] = (pa_do_volume_func_t) pa_volume_s24ne_c,
[PA_SAMPLE_S24RE] = (pa_do_volume_func_t) pa_volume_s24re_c,
[PA_SAMPLE_S24_32NE] = (pa_do_volume_func_t) pa_volume_s24_32ne_c,
[PA_SAMPLE_S24_32RE] = (pa_do_volume_func_t) pa_volume_s24_32re_c
[PA_SAMPLE_U8] = (pa_do_volume_func_t) pa_volume_u8_c,
[PA_SAMPLE_ALAW] = (pa_do_volume_func_t) pa_volume_alaw_c,
[PA_SAMPLE_ULAW] = (pa_do_volume_func_t) pa_volume_ulaw_c,
[PA_SAMPLE_S16NE] = (pa_do_volume_func_t) pa_volume_s16ne_c,
[PA_SAMPLE_S16RE] = (pa_do_volume_func_t) pa_volume_s16re_c,
[PA_SAMPLE_FLOAT32NE] = (pa_do_volume_func_t) pa_volume_float32ne_c,
[PA_SAMPLE_FLOAT32RE] = (pa_do_volume_func_t) pa_volume_float32re_c,
[PA_SAMPLE_S32NE] = (pa_do_volume_func_t) pa_volume_s32ne_c,
[PA_SAMPLE_S32RE] = (pa_do_volume_func_t) pa_volume_s32re_c,
[PA_SAMPLE_S24NE] = (pa_do_volume_func_t) pa_volume_s24ne_c,
[PA_SAMPLE_S24RE] = (pa_do_volume_func_t) pa_volume_s24re_c,
[PA_SAMPLE_S24_32NE] = (pa_do_volume_func_t) pa_volume_s24_32ne_c,
[PA_SAMPLE_S24_32RE] = (pa_do_volume_func_t) pa_volume_s24_32re_c
};
pa_do_volume_func_t pa_get_volume_func(pa_sample_format_t f) {

View file

@ -96,147 +96,147 @@
static void
pa_volume_s16ne_mmx (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
pa_reg_x86 channel, temp;
pa_reg_x86 channel, temp;
/* the max number of samples we process at a time, this is also the max amount
* we overread the volume array, which should have enough padding. */
channels = MAX (4, channels);
/* the max number of samples we process at a time, this is also the max amount
* we overread the volume array, which should have enough padding. */
channels = MAX (4, channels);
__asm__ __volatile__ (
" xor %3, %3 \n\t"
" sar $1, %2 \n\t" /* length /= sizeof (int16_t) */
" pcmpeqw %%mm6, %%mm6 \n\t" /* .. | ffff | ffff | */
" pcmpeqw %%mm7, %%mm7 \n\t" /* .. | ffff | ffff | */
" pslld $16, %%mm6 \n\t" /* .. | ffff | 0 | */
" psrld $31, %%mm7 \n\t" /* .. | 0 | 1 | */
__asm__ __volatile__ (
" xor %3, %3 \n\t"
" sar $1, %2 \n\t" /* length /= sizeof (int16_t) */
" pcmpeqw %%mm6, %%mm6 \n\t" /* .. | ffff | ffff | */
" pcmpeqw %%mm7, %%mm7 \n\t" /* .. | ffff | ffff | */
" pslld $16, %%mm6 \n\t" /* .. | ffff | 0 | */
" psrld $31, %%mm7 \n\t" /* .. | 0 | 1 | */
" test $1, %2 \n\t" /* check for odd samples */
" je 2f \n\t"
" test $1, %2 \n\t" /* check for odd samples */
" je 2f \n\t"
" movd (%1, %3, 4), %%mm0 \n\t" /* | v0h | v0l | */
" movw (%0), %w4 \n\t" /* .. | p0 | */
" movd %4, %%mm1 \n\t"
VOLUME_32x16 (%%mm1, %%mm0)
" movd %%mm0, %4 \n\t" /* .. | p0*v0 | */
" movw %w4, (%0) \n\t"
" add $2, %0 \n\t"
MOD_ADD ($1, %5)
" movd (%1, %3, 4), %%mm0 \n\t" /* | v0h | v0l | */
" movw (%0), %w4 \n\t" /* .. | p0 | */
" movd %4, %%mm1 \n\t"
VOLUME_32x16 (%%mm1, %%mm0)
" movd %%mm0, %4 \n\t" /* .. | p0*v0 | */
" movw %w4, (%0) \n\t"
" add $2, %0 \n\t"
MOD_ADD ($1, %5)
"2: \n\t"
" sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */
" test $1, %2 \n\t" /* check for odd samples */
" je 4f \n\t"
"2: \n\t"
" sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */
" test $1, %2 \n\t" /* check for odd samples */
" je 4f \n\t"
"3: \n\t" /* do samples in groups of 2 */
" movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */
" movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */
VOLUME_32x16 (%%mm1, %%mm0)
" movd %%mm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */
" add $4, %0 \n\t"
MOD_ADD ($2, %5)
"3: \n\t" /* do samples in groups of 2 */
" movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */
" movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */
VOLUME_32x16 (%%mm1, %%mm0)
" movd %%mm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */
" add $4, %0 \n\t"
MOD_ADD ($2, %5)
"4: \n\t"
" sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */
" cmp $0, %2 \n\t"
" je 6f \n\t"
"4: \n\t"
" sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */
" cmp $0, %2 \n\t"
" je 6f \n\t"
"5: \n\t" /* do samples in groups of 4 */
" movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */
" movq 8(%1, %3, 4), %%mm2 \n\t" /* | v3h | v3l | v2h | v2l | */
" movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */
" movd 4(%0), %%mm3 \n\t" /* .. | p3 | p2 | */
VOLUME_32x16 (%%mm1, %%mm0)
VOLUME_32x16 (%%mm3, %%mm2)
" movd %%mm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */
" movd %%mm2, 4(%0) \n\t" /* .. | p3*v3 | p2*v2 | */
" add $8, %0 \n\t"
MOD_ADD ($4, %5)
" dec %2 \n\t"
" jne 5b \n\t"
"5: \n\t" /* do samples in groups of 4 */
" movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */
" movq 8(%1, %3, 4), %%mm2 \n\t" /* | v3h | v3l | v2h | v2l | */
" movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */
" movd 4(%0), %%mm3 \n\t" /* .. | p3 | p2 | */
VOLUME_32x16 (%%mm1, %%mm0)
VOLUME_32x16 (%%mm3, %%mm2)
" movd %%mm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */
" movd %%mm2, 4(%0) \n\t" /* .. | p3*v3 | p2*v2 | */
" add $8, %0 \n\t"
MOD_ADD ($4, %5)
" dec %2 \n\t"
" jne 5b \n\t"
"6: \n\t"
" emms \n\t"
"6: \n\t"
" emms \n\t"
: "+r" (samples), "+r" (volumes), "+r" (length), "=D" ((pa_reg_x86)channel), "=&r" (temp)
: "r" ((pa_reg_x86)channels)
: "cc"
);
: "+r" (samples), "+r" (volumes), "+r" (length), "=D" ((pa_reg_x86)channel), "=&r" (temp)
: "r" ((pa_reg_x86)channels)
: "cc"
);
}
static void
pa_volume_s16re_mmx (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
pa_reg_x86 channel, temp;
pa_reg_x86 channel, temp;
/* the max number of samples we process at a time, this is also the max amount
* we overread the volume array, which should have enough padding. */
channels = MAX (4, channels);
/* the max number of samples we process at a time, this is also the max amount
* we overread the volume array, which should have enough padding. */
channels = MAX (4, channels);
__asm__ __volatile__ (
" xor %3, %3 \n\t"
" sar $1, %2 \n\t" /* length /= sizeof (int16_t) */
" pcmpeqw %%mm6, %%mm6 \n\t" /* .. | ffff | ffff | */
" pcmpeqw %%mm7, %%mm7 \n\t" /* .. | ffff | ffff | */
" pslld $16, %%mm6 \n\t" /* .. | ffff | 0 | */
" psrld $31, %%mm7 \n\t" /* .. | 0 | 1 | */
__asm__ __volatile__ (
" xor %3, %3 \n\t"
" sar $1, %2 \n\t" /* length /= sizeof (int16_t) */
" pcmpeqw %%mm6, %%mm6 \n\t" /* .. | ffff | ffff | */
" pcmpeqw %%mm7, %%mm7 \n\t" /* .. | ffff | ffff | */
" pslld $16, %%mm6 \n\t" /* .. | ffff | 0 | */
" psrld $31, %%mm7 \n\t" /* .. | 0 | 1 | */
" test $1, %2 \n\t" /* check for odd samples */
" je 2f \n\t"
" test $1, %2 \n\t" /* check for odd samples */
" je 2f \n\t"
" movd (%1, %3, 4), %%mm0 \n\t" /* | v0h | v0l | */
" movw (%0), %w4 \n\t" /* .. | p0 | */
" rorw $8, %w4 \n\t"
" movd %4, %%mm1 \n\t"
VOLUME_32x16 (%%mm1, %%mm0)
" movd %%mm0, %4 \n\t" /* .. | p0*v0 | */
" rorw $8, %w4 \n\t"
" movw %w4, (%0) \n\t"
" add $2, %0 \n\t"
MOD_ADD ($1, %5)
" movd (%1, %3, 4), %%mm0 \n\t" /* | v0h | v0l | */
" movw (%0), %w4 \n\t" /* .. | p0 | */
" rorw $8, %w4 \n\t"
" movd %4, %%mm1 \n\t"
VOLUME_32x16 (%%mm1, %%mm0)
" movd %%mm0, %4 \n\t" /* .. | p0*v0 | */
" rorw $8, %w4 \n\t"
" movw %w4, (%0) \n\t"
" add $2, %0 \n\t"
MOD_ADD ($1, %5)
"2: \n\t"
" sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */
" test $1, %2 \n\t" /* check for odd samples */
" je 4f \n\t"
"2: \n\t"
" sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */
" test $1, %2 \n\t" /* check for odd samples */
" je 4f \n\t"
"3: \n\t" /* do samples in groups of 2 */
" movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */
" movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */
SWAP_16 (%%mm1)
VOLUME_32x16 (%%mm1, %%mm0)
SWAP_16 (%%mm0)
" movd %%mm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */
" add $4, %0 \n\t"
MOD_ADD ($2, %5)
"3: \n\t" /* do samples in groups of 2 */
" movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */
" movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */
SWAP_16 (%%mm1)
VOLUME_32x16 (%%mm1, %%mm0)
SWAP_16 (%%mm0)
" movd %%mm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */
" add $4, %0 \n\t"
MOD_ADD ($2, %5)
"4: \n\t"
" sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */
" cmp $0, %2 \n\t"
" je 6f \n\t"
"4: \n\t"
" sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */
" cmp $0, %2 \n\t"
" je 6f \n\t"
"5: \n\t" /* do samples in groups of 4 */
" movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */
" movq 8(%1, %3, 4), %%mm2 \n\t" /* | v3h | v3l | v2h | v2l | */
" movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */
" movd 4(%0), %%mm3 \n\t" /* .. | p3 | p2 | */
SWAP_16_2 (%%mm1, %%mm3)
VOLUME_32x16 (%%mm1, %%mm0)
VOLUME_32x16 (%%mm3, %%mm2)
SWAP_16_2 (%%mm0, %%mm2)
" movd %%mm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */
" movd %%mm2, 4(%0) \n\t" /* .. | p3*v3 | p2*v2 | */
" add $8, %0 \n\t"
MOD_ADD ($4, %5)
" dec %2 \n\t"
" jne 5b \n\t"
"5: \n\t" /* do samples in groups of 4 */
" movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */
" movq 8(%1, %3, 4), %%mm2 \n\t" /* | v3h | v3l | v2h | v2l | */
" movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */
" movd 4(%0), %%mm3 \n\t" /* .. | p3 | p2 | */
SWAP_16_2 (%%mm1, %%mm3)
VOLUME_32x16 (%%mm1, %%mm0)
VOLUME_32x16 (%%mm3, %%mm2)
SWAP_16_2 (%%mm0, %%mm2)
" movd %%mm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */
" movd %%mm2, 4(%0) \n\t" /* .. | p3*v3 | p2*v2 | */
" add $8, %0 \n\t"
MOD_ADD ($4, %5)
" dec %2 \n\t"
" jne 5b \n\t"
"6: \n\t"
" emms \n\t"
"6: \n\t"
" emms \n\t"
: "+r" (samples), "+r" (volumes), "+r" (length), "=D" ((pa_reg_x86)channel), "=&r" (temp)
: "r" ((pa_reg_x86)channels)
: "cc"
);
: "+r" (samples), "+r" (volumes), "+r" (length), "=D" ((pa_reg_x86)channel), "=&r" (temp)
: "r" ((pa_reg_x86)channels)
: "cc"
);
}
#undef RUN_TEST
@ -248,51 +248,51 @@ pa_volume_s16re_mmx (int16_t *samples, int32_t *volumes, unsigned channels, unsi
#define PADDING 16
static void run_test (void) {
int16_t samples[SAMPLES];
int16_t samples_ref[SAMPLES];
int16_t samples_orig[SAMPLES];
int32_t volumes[CHANNELS + PADDING];
int i, j, padding;
pa_do_volume_func_t func;
struct timeval start, stop;
int16_t samples[SAMPLES];
int16_t samples_ref[SAMPLES];
int16_t samples_orig[SAMPLES];
int32_t volumes[CHANNELS + PADDING];
int i, j, padding;
pa_do_volume_func_t func;
struct timeval start, stop;
func = pa_get_volume_func (PA_SAMPLE_S16NE);
func = pa_get_volume_func (PA_SAMPLE_S16NE);
printf ("checking MMX %zd\n", sizeof (samples));
printf ("checking MMX %zd\n", sizeof (samples));
pa_random (samples, sizeof (samples));
memcpy (samples_ref, samples, sizeof (samples));
memcpy (samples_orig, samples, sizeof (samples));
pa_random (samples, sizeof (samples));
memcpy (samples_ref, samples, sizeof (samples));
memcpy (samples_orig, samples, sizeof (samples));
for (i = 0; i < CHANNELS; i++)
volumes[i] = rand() >> 1;
for (padding = 0; padding < PADDING; padding++, i++)
volumes[i] = volumes[padding];
for (i = 0; i < CHANNELS; i++)
volumes[i] = rand() >> 1;
for (padding = 0; padding < PADDING; padding++, i++)
volumes[i] = volumes[padding];
func (samples_ref, volumes, CHANNELS, sizeof (samples));
pa_volume_s16ne_mmx (samples, volumes, CHANNELS, sizeof (samples));
for (i = 0; i < SAMPLES; i++) {
if (samples[i] != samples_ref[i]) {
printf ("%d: %04x != %04x (%04x * %04x)\n", i, samples[i], samples_ref[i],
samples_orig[i], volumes[i % CHANNELS]);
}
}
pa_gettimeofday(&start);
for (j = 0; j < TIMES; j++) {
memcpy (samples, samples_orig, sizeof (samples));
pa_volume_s16ne_mmx (samples, volumes, CHANNELS, sizeof (samples));
}
pa_gettimeofday(&stop);
pa_log_info("MMX: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start));
pa_gettimeofday(&start);
for (j = 0; j < TIMES; j++) {
memcpy (samples_ref, samples_orig, sizeof (samples));
func (samples_ref, volumes, CHANNELS, sizeof (samples));
}
pa_gettimeofday(&stop);
pa_log_info("ref: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start));
pa_volume_s16ne_mmx (samples, volumes, CHANNELS, sizeof (samples));
for (i = 0; i < SAMPLES; i++) {
if (samples[i] != samples_ref[i]) {
printf ("%d: %04x != %04x (%04x * %04x)\n", i, samples[i], samples_ref[i],
samples_orig[i], volumes[i % CHANNELS]);
}
}
pa_gettimeofday(&start);
for (j = 0; j < TIMES; j++) {
memcpy (samples, samples_orig, sizeof (samples));
pa_volume_s16ne_mmx (samples, volumes, CHANNELS, sizeof (samples));
}
pa_gettimeofday(&stop);
pa_log_info("MMX: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start));
pa_gettimeofday(&start);
for (j = 0; j < TIMES; j++) {
memcpy (samples_ref, samples_orig, sizeof (samples));
func (samples_ref, volumes, CHANNELS, sizeof (samples));
}
pa_gettimeofday(&stop);
pa_log_info("ref: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start));
}
#endif
@ -301,13 +301,13 @@ static void run_test (void) {
void pa_volume_func_init_mmx (pa_cpu_x86_flag_t flags) {
#if defined (__i386__) || defined (__amd64__)
pa_log_info("Initialising MMX optimized functions.");
pa_log_info("Initialising MMX optimized functions.");
#ifdef RUN_TEST
run_test ();
run_test ();
#endif
pa_set_volume_func (PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_mmx);
pa_set_volume_func (PA_SAMPLE_S16RE, (pa_do_volume_func_t) pa_volume_s16re_mmx);
pa_set_volume_func (PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_mmx);
pa_set_volume_func (PA_SAMPLE_S16RE, (pa_do_volume_func_t) pa_volume_s16re_mmx);
#endif /* defined (__i386__) || defined (__amd64__) */
}

View file

@ -77,169 +77,169 @@
static void
pa_volume_s16ne_sse (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
pa_reg_x86 channel, temp;
pa_reg_x86 channel, temp;
/* the max number of samples we process at a time, this is also the max amount
* we overread the volume array, which should have enough padding. */
channels = MAX (8, channels);
/* the max number of samples we process at a time, this is also the max amount
* we overread the volume array, which should have enough padding. */
channels = MAX (8, channels);
__asm__ __volatile__ (
" xor %3, %3 \n\t"
" sar $1, %2 \n\t" /* length /= sizeof (int16_t) */
__asm__ __volatile__ (
" xor %3, %3 \n\t"
" sar $1, %2 \n\t" /* length /= sizeof (int16_t) */
" test $1, %2 \n\t" /* check for odd samples */
" je 2f \n\t"
" test $1, %2 \n\t" /* check for odd samples */
" je 2f \n\t"
" movd (%1, %3, 4), %%xmm0 \n\t" /* | v0h | v0l | */
" movw (%0), %w4 \n\t" /* .. | p0 | */
" movd %4, %%xmm1 \n\t"
VOLUME_32x16 (%%xmm1, %%xmm0)
" movd %%xmm0, %4 \n\t" /* .. | p0*v0 | */
" movw %w4, (%0) \n\t"
" add $2, %0 \n\t"
MOD_ADD ($1, %5)
" movd (%1, %3, 4), %%xmm0 \n\t" /* | v0h | v0l | */
" movw (%0), %w4 \n\t" /* .. | p0 | */
" movd %4, %%xmm1 \n\t"
VOLUME_32x16 (%%xmm1, %%xmm0)
" movd %%xmm0, %4 \n\t" /* .. | p0*v0 | */
" movw %w4, (%0) \n\t"
" add $2, %0 \n\t"
MOD_ADD ($1, %5)
"2: \n\t"
" sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */
" test $1, %2 \n\t"
" je 4f \n\t"
"2: \n\t"
" sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */
" test $1, %2 \n\t"
" je 4f \n\t"
"3: \n\t" /* do samples in groups of 2 */
" movq (%1, %3, 4), %%xmm0 \n\t" /* | v1h | v1l | v0h | v0l | */
" movd (%0), %%xmm1 \n\t" /* .. | p1 | p0 | */
VOLUME_32x16 (%%xmm1, %%xmm0)
" movd %%xmm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */
" add $4, %0 \n\t"
MOD_ADD ($2, %5)
"3: \n\t" /* do samples in groups of 2 */
" movq (%1, %3, 4), %%xmm0 \n\t" /* | v1h | v1l | v0h | v0l | */
" movd (%0), %%xmm1 \n\t" /* .. | p1 | p0 | */
VOLUME_32x16 (%%xmm1, %%xmm0)
" movd %%xmm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */
" add $4, %0 \n\t"
MOD_ADD ($2, %5)
"4: \n\t"
" sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */
" test $1, %2 \n\t"
" je 6f \n\t"
"4: \n\t"
" sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */
" test $1, %2 \n\t"
" je 6f \n\t"
/* FIXME, we can do aligned access of the volume values if we can guarantee
* that the array is 16 bytes aligned, we probably have to do the odd values
* after this then. */
"5: \n\t" /* do samples in groups of 4 */
" movdqu (%1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */
" movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */
VOLUME_32x16 (%%xmm1, %%xmm0)
" movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */
" add $8, %0 \n\t"
MOD_ADD ($4, %5)
/* FIXME, we can do aligned access of the volume values if we can guarantee
* that the array is 16 bytes aligned, we probably have to do the odd values
* after this then. */
"5: \n\t" /* do samples in groups of 4 */
" movdqu (%1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */
" movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */
VOLUME_32x16 (%%xmm1, %%xmm0)
" movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */
" add $8, %0 \n\t"
MOD_ADD ($4, %5)
"6: \n\t"
" sar $1, %2 \n\t" /* prepare for processing 8 samples at a time */
" cmp $0, %2 \n\t"
" je 8f \n\t"
"6: \n\t"
" sar $1, %2 \n\t" /* prepare for processing 8 samples at a time */
" cmp $0, %2 \n\t"
" je 8f \n\t"
"7: \n\t" /* do samples in groups of 8 */
" movdqu (%1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */
" movdqu 16(%1, %3, 4), %%xmm2 \n\t" /* | v7h | v7l .. v4h | v4l | */
" movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */
" movq 8(%0), %%xmm3 \n\t" /* .. | p7 .. p4 | */
VOLUME_32x16 (%%xmm1, %%xmm0)
VOLUME_32x16 (%%xmm3, %%xmm2)
" movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */
" movq %%xmm2, 8(%0) \n\t" /* .. | p7*v7 .. p4*v4 | */
" add $16, %0 \n\t"
MOD_ADD ($8, %5)
" dec %2 \n\t"
" jne 7b \n\t"
"8: \n\t"
"7: \n\t" /* do samples in groups of 8 */
" movdqu (%1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */
" movdqu 16(%1, %3, 4), %%xmm2 \n\t" /* | v7h | v7l .. v4h | v4l | */
" movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */
" movq 8(%0), %%xmm3 \n\t" /* .. | p7 .. p4 | */
VOLUME_32x16 (%%xmm1, %%xmm0)
VOLUME_32x16 (%%xmm3, %%xmm2)
" movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */
" movq %%xmm2, 8(%0) \n\t" /* .. | p7*v7 .. p4*v4 | */
" add $16, %0 \n\t"
MOD_ADD ($8, %5)
" dec %2 \n\t"
" jne 7b \n\t"
"8: \n\t"
: "+r" (samples), "+r" (volumes), "+r" (length), "=D" (channel), "=&r" (temp)
: "r" ((pa_reg_x86)channels)
: "cc"
);
: "+r" (samples), "+r" (volumes), "+r" (length), "=D" (channel), "=&r" (temp)
: "r" ((pa_reg_x86)channels)
: "cc"
);
}
static void
pa_volume_s16re_sse (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
pa_reg_x86 channel, temp;
pa_reg_x86 channel, temp;
/* the max number of samples we process at a time, this is also the max amount
* we overread the volume array, which should have enough padding. */
channels = MAX (8, channels);
/* the max number of samples we process at a time, this is also the max amount
* we overread the volume array, which should have enough padding. */
channels = MAX (8, channels);
__asm__ __volatile__ (
" xor %3, %3 \n\t"
" sar $1, %2 \n\t" /* length /= sizeof (int16_t) */
__asm__ __volatile__ (
" xor %3, %3 \n\t"
" sar $1, %2 \n\t" /* length /= sizeof (int16_t) */
" test $1, %2 \n\t" /* check for odd samples */
" je 2f \n\t"
" test $1, %2 \n\t" /* check for odd samples */
" je 2f \n\t"
" movd (%1, %3, 4), %%xmm0 \n\t" /* | v0h | v0l | */
" movw (%0), %w4 \n\t" /* .. | p0 | */
" rorw $8, %w4 \n\t"
" movd %4, %%xmm1 \n\t"
VOLUME_32x16 (%%xmm1, %%xmm0)
" movd %%xmm0, %4 \n\t" /* .. | p0*v0 | */
" rorw $8, %w4 \n\t"
" movw %w4, (%0) \n\t"
" add $2, %0 \n\t"
MOD_ADD ($1, %5)
" movd (%1, %3, 4), %%xmm0 \n\t" /* | v0h | v0l | */
" movw (%0), %w4 \n\t" /* .. | p0 | */
" rorw $8, %w4 \n\t"
" movd %4, %%xmm1 \n\t"
VOLUME_32x16 (%%xmm1, %%xmm0)
" movd %%xmm0, %4 \n\t" /* .. | p0*v0 | */
" rorw $8, %w4 \n\t"
" movw %w4, (%0) \n\t"
" add $2, %0 \n\t"
MOD_ADD ($1, %5)
"2: \n\t"
" sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */
" test $1, %2 \n\t"
" je 4f \n\t"
"2: \n\t"
" sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */
" test $1, %2 \n\t"
" je 4f \n\t"
"3: \n\t" /* do samples in groups of 2 */
" movq (%1, %3, 4), %%xmm0 \n\t" /* | v1h | v1l | v0h | v0l | */
" movd (%0), %%xmm1 \n\t" /* .. | p1 | p0 | */
SWAP_16 (%%xmm1)
VOLUME_32x16 (%%xmm1, %%xmm0)
SWAP_16 (%%xmm0)
" movd %%xmm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */
" add $4, %0 \n\t"
MOD_ADD ($2, %5)
"3: \n\t" /* do samples in groups of 2 */
" movq (%1, %3, 4), %%xmm0 \n\t" /* | v1h | v1l | v0h | v0l | */
" movd (%0), %%xmm1 \n\t" /* .. | p1 | p0 | */
SWAP_16 (%%xmm1)
VOLUME_32x16 (%%xmm1, %%xmm0)
SWAP_16 (%%xmm0)
" movd %%xmm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */
" add $4, %0 \n\t"
MOD_ADD ($2, %5)
"4: \n\t"
" sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */
" test $1, %2 \n\t"
" je 6f \n\t"
"4: \n\t"
" sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */
" test $1, %2 \n\t"
" je 6f \n\t"
/* FIXME, we can do aligned access of the volume values if we can guarantee
* that the array is 16 bytes aligned, we probably have to do the odd values
* after this then. */
"5: \n\t" /* do samples in groups of 4 */
" movdqu (%1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */
" movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */
SWAP_16 (%%xmm1)
VOLUME_32x16 (%%xmm1, %%xmm0)
SWAP_16 (%%xmm0)
" movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */
" add $8, %0 \n\t"
MOD_ADD ($4, %5)
/* FIXME, we can do aligned access of the volume values if we can guarantee
* that the array is 16 bytes aligned, we probably have to do the odd values
* after this then. */
"5: \n\t" /* do samples in groups of 4 */
" movdqu (%1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */
" movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */
SWAP_16 (%%xmm1)
VOLUME_32x16 (%%xmm1, %%xmm0)
SWAP_16 (%%xmm0)
" movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */
" add $8, %0 \n\t"
MOD_ADD ($4, %5)
"6: \n\t"
" sar $1, %2 \n\t" /* prepare for processing 8 samples at a time */
" cmp $0, %2 \n\t"
" je 8f \n\t"
"6: \n\t"
" sar $1, %2 \n\t" /* prepare for processing 8 samples at a time */
" cmp $0, %2 \n\t"
" je 8f \n\t"
"7: \n\t" /* do samples in groups of 8 */
" movdqu (%1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */
" movdqu 16(%1, %3, 4), %%xmm2 \n\t" /* | v7h | v7l .. v4h | v4l | */
" movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */
" movq 8(%0), %%xmm3 \n\t" /* .. | p7 .. p4 | */
SWAP_16_2 (%%xmm1, %%xmm3)
VOLUME_32x16 (%%xmm1, %%xmm0)
VOLUME_32x16 (%%xmm3, %%xmm2)
SWAP_16_2 (%%xmm0, %%xmm2)
" movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */
" movq %%xmm2, 8(%0) \n\t" /* .. | p7*v7 .. p4*v4 | */
" add $16, %0 \n\t"
MOD_ADD ($8, %5)
" dec %2 \n\t"
" jne 7b \n\t"
"8: \n\t"
"7: \n\t" /* do samples in groups of 8 */
" movdqu (%1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */
" movdqu 16(%1, %3, 4), %%xmm2 \n\t" /* | v7h | v7l .. v4h | v4l | */
" movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */
" movq 8(%0), %%xmm3 \n\t" /* .. | p7 .. p4 | */
SWAP_16_2 (%%xmm1, %%xmm3)
VOLUME_32x16 (%%xmm1, %%xmm0)
VOLUME_32x16 (%%xmm3, %%xmm2)
SWAP_16_2 (%%xmm0, %%xmm2)
" movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */
" movq %%xmm2, 8(%0) \n\t" /* .. | p7*v7 .. p4*v4 | */
" add $16, %0 \n\t"
MOD_ADD ($8, %5)
" dec %2 \n\t"
" jne 7b \n\t"
"8: \n\t"
: "+r" (samples), "+r" (volumes), "+r" (length), "=D" (channel), "=&r" (temp)
: "r" ((pa_reg_x86)channels)
: "cc"
);
: "+r" (samples), "+r" (volumes), "+r" (length), "=D" (channel), "=&r" (temp)
: "r" ((pa_reg_x86)channels)
: "cc"
);
}
#undef RUN_TEST
@ -251,64 +251,64 @@ pa_volume_s16re_sse (int16_t *samples, int32_t *volumes, unsigned channels, unsi
#define PADDING 16
static void run_test (void) {
int16_t samples[SAMPLES];
int16_t samples_ref[SAMPLES];
int16_t samples_orig[SAMPLES];
int32_t volumes[CHANNELS + PADDING];
int i, j, padding;
pa_do_volume_func_t func;
struct timeval start, stop;
int16_t samples[SAMPLES];
int16_t samples_ref[SAMPLES];
int16_t samples_orig[SAMPLES];
int32_t volumes[CHANNELS + PADDING];
int i, j, padding;
pa_do_volume_func_t func;
struct timeval start, stop;
func = pa_get_volume_func (PA_SAMPLE_S16NE);
func = pa_get_volume_func (PA_SAMPLE_S16NE);
printf ("checking SSE %zd\n", sizeof (samples));
printf ("checking SSE %zd\n", sizeof (samples));
pa_random (samples, sizeof (samples));
memcpy (samples_ref, samples, sizeof (samples));
memcpy (samples_orig, samples, sizeof (samples));
pa_random (samples, sizeof (samples));
memcpy (samples_ref, samples, sizeof (samples));
memcpy (samples_orig, samples, sizeof (samples));
for (i = 0; i < CHANNELS; i++)
volumes[i] = rand() >> 1;
for (padding = 0; padding < PADDING; padding++, i++)
volumes[i] = volumes[padding];
for (i = 0; i < CHANNELS; i++)
volumes[i] = rand() >> 1;
for (padding = 0; padding < PADDING; padding++, i++)
volumes[i] = volumes[padding];
func (samples_ref, volumes, CHANNELS, sizeof (samples));
pa_volume_s16ne_sse (samples, volumes, CHANNELS, sizeof (samples));
for (i = 0; i < SAMPLES; i++) {
if (samples[i] != samples_ref[i]) {
printf ("%d: %04x != %04x (%04x * %04x)\n", i, samples[i], samples_ref[i],
samples_orig[i], volumes[i % CHANNELS]);
}
}
pa_gettimeofday(&start);
for (j = 0; j < TIMES; j++) {
memcpy (samples, samples_orig, sizeof (samples));
pa_volume_s16ne_sse (samples, volumes, CHANNELS, sizeof (samples));
}
pa_gettimeofday(&stop);
pa_log_info("SSE: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start));
pa_gettimeofday(&start);
for (j = 0; j < TIMES; j++) {
memcpy (samples_ref, samples_orig, sizeof (samples));
func (samples_ref, volumes, CHANNELS, sizeof (samples));
}
pa_gettimeofday(&stop);
pa_log_info("ref: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start));
pa_volume_s16ne_sse (samples, volumes, CHANNELS, sizeof (samples));
for (i = 0; i < SAMPLES; i++) {
if (samples[i] != samples_ref[i]) {
printf ("%d: %04x != %04x (%04x * %04x)\n", i, samples[i], samples_ref[i],
samples_orig[i], volumes[i % CHANNELS]);
}
}
pa_gettimeofday(&start);
for (j = 0; j < TIMES; j++) {
memcpy (samples, samples_orig, sizeof (samples));
pa_volume_s16ne_sse (samples, volumes, CHANNELS, sizeof (samples));
}
pa_gettimeofday(&stop);
pa_log_info("SSE: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start));
pa_gettimeofday(&start);
for (j = 0; j < TIMES; j++) {
memcpy (samples_ref, samples_orig, sizeof (samples));
func (samples_ref, volumes, CHANNELS, sizeof (samples));
}
pa_gettimeofday(&stop);
pa_log_info("ref: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start));
}
#endif
#endif /* defined (__i386__) || defined (__amd64__) */
void pa_volume_func_init_sse (pa_cpu_x86_flag_t flags) {
#if defined (__i386__) || defined (__amd64__)
pa_log_info("Initialising SSE optimized functions.");
pa_log_info("Initialising SSE optimized functions.");
#ifdef RUN_TEST
run_test ();
run_test ();
#endif
pa_set_volume_func (PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_sse);
pa_set_volume_func (PA_SAMPLE_S16RE, (pa_do_volume_func_t) pa_volume_s16re_sse);
pa_set_volume_func (PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_sse);
pa_set_volume_func (PA_SAMPLE_S16RE, (pa_do_volume_func_t) pa_volume_s16re_sse);
#endif /* defined (__i386__) || defined (__amd64__) */
}