Submission #1164332
Source Code Expand
#include <unistd.h>
#include <complex.h>
#include <immintrin.h>
#define PI2 6.28318530717958647692
const int k = 18;
const int m = (1 << k);
typedef double complex cmplx;
double __attribute__ ((aligned(32))) Cr[1<<k];
double __attribute__ ((aligned(32))) Ci[1<<k];
cmplx __attribute__ ((aligned(32))) w[1<<(k-1)];
const int mod10[200] = {
0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,
0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,
0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,
0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9
};
const int div10[200] = {
0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,
5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,9,9,
10,10,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,13,13,
14,14,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15,15,15,16,16,16,16,16,16,16,16,16,16,17,17,17,17,17,17,17,17,17,17,
18,18,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,19,19
};
char ibuf[800008];
char *ibufe = ibuf-1;
__attribute__((target("arch=sandybridge")))
void readall(){
read(STDIN_FILENO, ibuf, sizeof(ibuf));
}
__attribute__((target("arch=sandybridge")))
int read_uint(){
int x=0;
while(*(++ibufe) <'0');
do {
x *= 10;
x += *ibufe-'0';
} while(*(++ibufe) >='0');
return x;
}
char buf[1977782];
char *bufe = buf;
__attribute__((target("arch=sandybridge")))
void write_uint(int x){
int i;
static char tmp[13];
if(x==0){
*bufe++ = '0';
*bufe++ = '\n';
return;
}
for(i=0; x; i++){
if(x >= 200){
tmp[i] = x % 10;
x /= 10;
}
else {
tmp[i] = mod10[x];
x = div10[x];
}
}
for(i--; i >= 0; i--){
*bufe++ = '0' + tmp[i];
}
*bufe++ = '\n';
}
__attribute__((target("arch=sandybridge")))
void flush(){
write(STDOUT_FILENO, buf, bufe-buf);
}
__attribute__((target("arch=sandybridge")))
void complex_mul_avx(__m256d *ar, __m256d *ai, __m256d *br, __m256d *bi){
__m256d arbr = _mm256_mul_pd(*ar, *br);
__m256d aibi = _mm256_mul_pd(*ai, *bi);
__m256d arbi = _mm256_mul_pd(*ar, *bi);
__m256d aibr = _mm256_mul_pd(*ai, *br);
*ar = _mm256_sub_pd(arbr, aibi);
*ai = _mm256_add_pd(arbi, aibr);
/*
__m256d aibi = _mm256_mul_pd(*ai, *bi);
__m256d aibr = _mm256_mul_pd(*ai, *br);
*ai = _mm256_fmadd_pd(*ar, *bi, aibr);
*ar = _mm256_fmsub_pd(*ar, *br, aibi);
*/
}
__attribute__((target("arch=sandybridge")))
void complex_mul(double *ar, double *ai, double br, double bi){
double tmpar = *ar * br - *ai * bi;
*ai = *ar * bi + *ai * br;
*ar = tmpar;
}
__attribute__((target("arch=sandybridge")))
void fft_avx(int k, double *xr, double *xi, const cmplx *w){
const int m = 1 << k;
int u = 1;
int v = m/4;
int i, j;
for(int j = 0;j<v; j+=4){
__m256d ar = _mm256_load_pd(xr+j);
__m256d ai = _mm256_load_pd(xi+j);
__m256d br = _mm256_load_pd(xr+j+v);
__m256d bi = _mm256_load_pd(xi+j+v);
_mm256_store_pd(xr+j, _mm256_add_pd(ar, br));
_mm256_store_pd(xi+j, _mm256_add_pd(ai, bi));
_mm256_store_pd(xr+j+v, _mm256_sub_pd(ar, br));
_mm256_store_pd(xi+j+v, _mm256_sub_pd(ai, bi));
_mm256_store_pd(xr+j+2*v, _mm256_add_pd(ar, bi));
_mm256_store_pd(xi+j+2*v, _mm256_sub_pd(ai, br));
_mm256_store_pd(xr+j+3*v, _mm256_sub_pd(ar, bi));
_mm256_store_pd(xi+j+3*v, _mm256_add_pd(ai, br));
}
u <<= 2;
v >>= 2;
for(i=k-2;v>=4;i-=2){
int jh;
for(jh=0;jh<u;jh++){
double wjr = creal(w[jh<<1]);
double wji = cimag(w[jh<<1]);
double wj2r = creal(w[jh]);
double wj2i = cimag(w[jh]);
double wj3r = wjr*wj2r-wji*wj2i;
double wj3i = wjr*wj2i+wji*wj2r;
int je;
__m256d wjr4 = _mm256_broadcast_sd(& wjr);
__m256d wji4 = _mm256_broadcast_sd(& wji);
__m256d wj2r4 = _mm256_broadcast_sd(& wj2r);
__m256d wj2i4 = _mm256_broadcast_sd(& wj2i);
__m256d wj3r4 = _mm256_broadcast_sd(& wj3r);
__m256d wj3i4 = _mm256_broadcast_sd(& wj3i);
for(j = jh << i, je = j+v;j<je; j+=4){
__m256d tmp0r = _mm256_load_pd(xr+j);
__m256d tmp0i = _mm256_load_pd(xi+j);
__m256d tmp1r = _mm256_load_pd(xr+v+j);
__m256d tmp1i = _mm256_load_pd(xi+v+j);
complex_mul_avx(&tmp1r, &tmp1i, &wjr4, &wji4);
__m256d tmp2r = _mm256_load_pd(xr+2*v+j);
__m256d tmp2i = _mm256_load_pd(xi+2*v+j);
complex_mul_avx(&tmp2r, &tmp2i, &wj2r4, &wj2i4);
__m256d tmp3r = _mm256_load_pd(xr+3*v+j);
__m256d tmp3i = _mm256_load_pd(xi+3*v+j);
complex_mul_avx(&tmp3r, &tmp3i, &wj3r4, &wj3i4);
__m256d ttmp0r = _mm256_add_pd(tmp0r, tmp2r);
__m256d ttmp0i = _mm256_add_pd(tmp0i, tmp2i);
__m256d ttmp2r = _mm256_sub_pd(tmp0r, tmp2r);
__m256d ttmp2i = _mm256_sub_pd(tmp0i, tmp2i);
__m256d ttmp1r = _mm256_add_pd(tmp1r, tmp3r);
__m256d ttmp1i = _mm256_add_pd(tmp1i, tmp3i);
__m256d ttmp3r = _mm256_sub_pd(tmp1i, tmp3i);
__m256d ttmp3i = _mm256_sub_pd(-tmp1r, -tmp3r);
_mm256_store_pd(xr+j, _mm256_add_pd(ttmp0r, ttmp1r));
_mm256_store_pd(xi+j, _mm256_add_pd(ttmp0i, ttmp1i));
_mm256_store_pd(xr+j+v, _mm256_sub_pd(ttmp0r, ttmp1r));
_mm256_store_pd(xi+j+v, _mm256_sub_pd(ttmp0i, ttmp1i));
_mm256_store_pd(xr+j+2*v, _mm256_add_pd(ttmp2r, ttmp3r));
_mm256_store_pd(xi+j+2*v, _mm256_add_pd(ttmp2i, ttmp3i));
_mm256_store_pd(xr+j+3*v, _mm256_sub_pd(ttmp2r, ttmp3r));
_mm256_store_pd(xi+j+3*v, _mm256_sub_pd(ttmp2i, ttmp3i));
}
}
u <<= 2;
v >>= 2;
}
for(;i>0;i-=2){
int jh;
for(jh=0;jh<u;jh++){
double wjr = creal(w[jh<<1]);
double wji = cimag(w[jh<<1]);
double wj2r = creal(w[jh]);
double wj2i = cimag(w[jh]);
double wj3r = wjr*wj2r-wji*wj2i;
double wj3i = wjr*wj2i+wji*wj2r;
int je;
for(j = jh << i, je = j+v;j<je; j++){
double tmp0r = xr[j];
double tmp0i = xi[j];
double tmp1r = xr[j+v];
double tmp1i = xi[j+v];
complex_mul(&tmp1r, &tmp1i, wjr, wji);
double tmp2r = xr[j+2*v];
double tmp2i = xi[j+2*v];
complex_mul(&tmp2r, &tmp2i, wj2r, wj2i);
double tmp3r = xr[j+3*v];
double tmp3i = xi[j+3*v];
complex_mul(&tmp3r, &tmp3i, wj3r, wj3i);
double ttmp0r = tmp0r + tmp2r;
double ttmp0i = tmp0i + tmp2i;
double ttmp2r = tmp0r - tmp2r;
double ttmp2i = tmp0i - tmp2i;
double ttmp1r = tmp1r + tmp3r;
double ttmp1i = tmp1i + tmp3i;
double ttmp3r = tmp1i - tmp3i;
double ttmp3i = -(tmp1r - tmp3r);
xr[j] = ttmp0r + ttmp1r;
xi[j] = ttmp0i + ttmp1i;
xr[j+v] = ttmp0r - ttmp1r;
xi[j+v] = ttmp0i - ttmp1i;
xr[j+2*v] = ttmp2r + ttmp3r;
xi[j+2*v] = ttmp2i + ttmp3i;
xr[j+3*v] = ttmp2r - ttmp3r;
xi[j+3*v] = ttmp2i - ttmp3i;
}
}
u <<= 2;
v >>= 2;
}
}
__attribute__((target("arch=sandybridge")))
void ifft_avx(int k, double *xr, double *xi, const cmplx *w){
const int m = 1 << k;
int u = m/4;
int v = 1;
int i, j;
if(k>=2){
int jh;
for(jh=0;jh<u;jh++){
double wjr = creal(w[jh<<1]);
double wji = -cimag(w[jh<<1]);
double wj2r = creal(w[jh]);
double wj2i = -cimag(w[jh]);
double wj3r = wjr*wj2r-wji*wj2i;
double wj3i = wjr*wj2i+wji*wj2r;
j = jh << 2;
double tmp0r = xr[j];
double tmp0i = xi[j];
double tmp1r = xr[j+v];
double tmp1i = xi[j+v];
double tmp2r = xr[j+2*v];
double tmp2i = xi[j+2*v];
double tmp3r = xr[j+3*v];
double tmp3i = xi[j+3*v];
double ttmp0r = tmp0r + tmp1r;
double ttmp0i = tmp0i + tmp1i;
double ttmp1r = tmp0r - tmp1r;
double ttmp1i = tmp0i - tmp1i;
double ttmp2r = tmp2r + tmp3r;
double ttmp2i = tmp2i + tmp3i;
double ttmp3r = -(tmp2i - tmp3i);
double ttmp3i = tmp2r - tmp3r;
xr[j] = ttmp0r + ttmp2r;
xi[j] = ttmp0i + ttmp2i;
xr[j+v] = ttmp1r + ttmp3r;
xi[j+v] = ttmp1i + ttmp3i;
xr[j+2*v] = ttmp0r - ttmp2r;
xi[j+2*v] = ttmp0i - ttmp2i;
xr[j+3*v] = ttmp1r - ttmp3r;
xi[j+3*v] = ttmp1i - ttmp3i;
complex_mul(xr+j+v, xi+j+v, wjr, wji);
complex_mul(xr+j+2*v, xi+j+2*v, wj2r, wj2i);
complex_mul(xr+j+3*v, xi+j+3*v, wj3r, wj3i);
}
u >>= 2;
v <<= 2;
}
for(i=4;i<=k;i+=2){
int jh;
for(jh=0;jh<u;jh++){
double wjr = creal(w[jh<<1]);
double wji = -cimag(w[jh<<1]);
double wj2r = creal(w[jh]);
double wj2i = -cimag(w[jh]);
double wj3r = wjr*wj2r-wji*wj2i;
double wj3i = wjr*wj2i+wji*wj2r;
int je;
__m256d wjr4 = _mm256_broadcast_sd(& wjr);
__m256d wji4 = _mm256_broadcast_sd(& wji);
__m256d wj2r4 = _mm256_broadcast_sd(& wj2r);
__m256d wj2i4 = _mm256_broadcast_sd(& wj2i);
__m256d wj3r4 = _mm256_broadcast_sd(& wj3r);
__m256d wj3i4 = _mm256_broadcast_sd(& wj3i);
for(j = jh << i, je = j+v;j<je; j+=4){
__m256d tmp0r = _mm256_load_pd(xr+j);
__m256d tmp0i = _mm256_load_pd(xi+j);
__m256d tmp1r = _mm256_load_pd(xr+v+j);
__m256d tmp1i = _mm256_load_pd(xi+v+j);
__m256d tmp2r = _mm256_load_pd(xr+2*v+j);
__m256d tmp2i = _mm256_load_pd(xi+2*v+j);
__m256d tmp3r = _mm256_load_pd(xr+3*v+j);
__m256d tmp3i = _mm256_load_pd(xi+3*v+j);
__m256d ttmp0r = _mm256_add_pd(tmp0r, tmp1r);
__m256d ttmp0i = _mm256_add_pd(tmp0i, tmp1i);
__m256d ttmp1r = _mm256_sub_pd(tmp0r, tmp1r);
__m256d ttmp1i = _mm256_sub_pd(tmp0i, tmp1i);
__m256d ttmp2r = _mm256_add_pd(tmp2r, tmp3r);
__m256d ttmp2i = _mm256_add_pd(tmp2i, tmp3i);
__m256d ttmp3r = _mm256_sub_pd(-tmp2i, -tmp3i);
__m256d ttmp3i = _mm256_sub_pd(tmp2r, tmp3r);
__m256d tttmp0r = _mm256_add_pd(ttmp0r, ttmp2r);
__m256d tttmp0i = _mm256_add_pd(ttmp0i, ttmp2i);
__m256d tttmp2r = _mm256_sub_pd(ttmp0r, ttmp2r);
__m256d tttmp2i = _mm256_sub_pd(ttmp0i, ttmp2i);
__m256d tttmp1r = _mm256_add_pd(ttmp1r, ttmp3r);
__m256d tttmp1i = _mm256_add_pd(ttmp1i, ttmp3i);
__m256d tttmp3r = _mm256_sub_pd(ttmp1r, ttmp3r);
__m256d tttmp3i = _mm256_sub_pd(ttmp1i, ttmp3i);
complex_mul_avx(&tttmp1r, &tttmp1i, &wjr4, &wji4);
complex_mul_avx(&tttmp2r, &tttmp2i, &wj2r4, &wj2i4);
complex_mul_avx(&tttmp3r, &tttmp3i, &wj3r4, &wj3i4);
_mm256_store_pd(xr+j, tttmp0r);
_mm256_store_pd(xi+j, tttmp0i);
_mm256_store_pd(xr+j+v, tttmp1r);
_mm256_store_pd(xi+j+v, tttmp1i);
_mm256_store_pd(xr+j+2*v, tttmp2r);
_mm256_store_pd(xi+j+2*v, tttmp2i);
_mm256_store_pd(xr+j+3*v, tttmp3r);
_mm256_store_pd(xi+j+3*v, tttmp3i);
}
}
u >>= 2;
v <<= 2;
}
if(k&1){
for(j=0; j<m/2; j+=4){
__m256d ar = _mm256_load_pd(xr+j);
__m256d ai = _mm256_load_pd(xi+j);
__m256d br = _mm256_load_pd(xr+j+m/2);
__m256d bi = _mm256_load_pd(xi+j+m/2);
_mm256_store_pd(xr+j, _mm256_add_pd(ar, br));
_mm256_store_pd(xi+j, _mm256_add_pd(ai, bi));
_mm256_store_pd(xr+j+m/2, _mm256_sub_pd(ar, br));
_mm256_store_pd(xi+j+m/2, _mm256_sub_pd(ai, bi));
}
}
}
__attribute__((target("arch=sandybridge")))
void genw(int i, int b, cmplx z){
if(b == 0){
w[i] = z;
}
else {
genw(i, b>>1, z);
genw(i|b, b>>1, z*w[b]);
}
}
__attribute__((target("arch=sandybridge")))
int main(){
int n;
const double arg = -PI2/m;
for(int i=1, j=m/4; j; i<<=1, j>>=1){
w[i] = cexp(I * (arg * j));
}
genw(0, m/4, 1);
readall();
n = read_uint();
for(int i=1;i<=n;i++){
Cr[i] = read_uint();
Ci[i] = read_uint();
}
// k = 33 - __builtin_clz(n);
// m = 1 << k;
fft_avx(k, Cr, Ci, w);
Ci[0] = 4 * Cr[0] * Ci[0];
Ci[1] = 4 * Cr[1] * Ci[1];
Cr[0] = Cr[1] = 0;
for(int i = 2; i < m; i+=2){
int y = 1 << (31-__builtin_clz(i));
int j = i^(y-1);
double tmpr = Cr[i] - Cr[j];
double tmpi = Ci[i] + Ci[j];
Cr[i] += Cr[j];
Ci[i] -= Ci[j];
complex_mul(Cr+i, Ci+i, tmpr, tmpi);
Cr[j] = -Cr[i];
Ci[j] = Ci[i];
}
for(int i = 0; i < m; i+=2){
double ttmpr = Cr[i] + Cr[i^1];
double ttmpi = Ci[i] + Ci[i^1];
double tmpr = Cr[i] - Cr[i^1];
double tmpi = Ci[i] - Ci[i^1];
complex_mul(&tmpr, &tmpi, creal(w[i/2]), cimag(w[i/2]));
Cr[i/2] = ttmpr + tmpi;
Ci[i/2] = ttmpi - tmpr;
}
ifft_avx(k-1, Cr, Ci, w);
for(int i=1;i<=n;i++){
write_uint((Cr[i])/(4*m)+0.5);
write_uint((Ci[i])/(4*m)+0.5);
}
flush();
}
Submission Info
Submission Time |
|
Task |
C - 高速フーリエ変換 |
User |
ryuhei |
Language |
C (Clang 3.8.0) |
Score |
100 |
Code Size |
13629 Byte |
Status |
AC |
Exec Time |
17 ms |
Memory |
10624 KB |
Judge Result
Set Name |
Sample |
All |
Score / Max Score |
0 / 0 |
100 / 100 |
Status |
|
|
Set Name |
Test Cases |
Sample |
00_sample_01 |
All |
00_sample_01, 01_00_01, 01_01_19, 01_02_31, 01_03_22, 01_04_31, 01_05_40, 01_06_15, 01_07_39, 01_08_28, 01_09_30, 01_10_23, 01_11_33, 01_12_11, 01_13_28, 01_14_41, 01_15_26, 01_16_49, 01_17_34, 01_18_02, 01_19_33, 01_20_29, 02_00_51254, 02_01_82431, 02_02_17056, 02_03_34866, 02_04_6779, 02_05_65534, 02_06_65535, 02_07_65536, 02_08_65537, 02_09_65538, 02_10_100000 |
Case Name |
Status |
Exec Time |
Memory |
00_sample_01 |
AC |
11 ms |
8704 KB |
01_00_01 |
AC |
9 ms |
8320 KB |
01_01_19 |
AC |
9 ms |
8320 KB |
01_02_31 |
AC |
9 ms |
8320 KB |
01_03_22 |
AC |
9 ms |
8320 KB |
01_04_31 |
AC |
9 ms |
8320 KB |
01_05_40 |
AC |
9 ms |
8320 KB |
01_06_15 |
AC |
9 ms |
8320 KB |
01_07_39 |
AC |
9 ms |
8320 KB |
01_08_28 |
AC |
9 ms |
8320 KB |
01_09_30 |
AC |
8 ms |
8320 KB |
01_10_23 |
AC |
9 ms |
8320 KB |
01_11_33 |
AC |
8 ms |
8320 KB |
01_12_11 |
AC |
9 ms |
8320 KB |
01_13_28 |
AC |
9 ms |
8320 KB |
01_14_41 |
AC |
9 ms |
8320 KB |
01_15_26 |
AC |
9 ms |
8320 KB |
01_16_49 |
AC |
9 ms |
8320 KB |
01_17_34 |
AC |
9 ms |
8320 KB |
01_18_02 |
AC |
9 ms |
8320 KB |
01_19_33 |
AC |
9 ms |
8320 KB |
01_20_29 |
AC |
9 ms |
8320 KB |
02_00_51254 |
AC |
13 ms |
9472 KB |
02_01_82431 |
AC |
15 ms |
10240 KB |
02_02_17056 |
AC |
10 ms |
8576 KB |
02_03_34866 |
AC |
11 ms |
9088 KB |
02_04_6779 |
AC |
9 ms |
8448 KB |
02_05_65534 |
AC |
14 ms |
9856 KB |
02_06_65535 |
AC |
14 ms |
9856 KB |
02_07_65536 |
AC |
14 ms |
9856 KB |
02_08_65537 |
AC |
14 ms |
9856 KB |
02_09_65538 |
AC |
14 ms |
9856 KB |
02_10_100000 |
AC |
17 ms |
10624 KB |