Submission #1164332


Source Code Expand

#include <unistd.h>
#include <complex.h>
#include <immintrin.h>

#define PI2 6.28318530717958647692

const int k = 18;
const int m = (1 << k);

typedef double complex cmplx;

double __attribute__ ((aligned(32))) Cr[1<<k];
double __attribute__ ((aligned(32))) Ci[1<<k];
cmplx __attribute__ ((aligned(32))) w[1<<(k-1)];


const int mod10[200] = {
0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,
0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,
0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,
0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9
};

const int div10[200] = {
0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,
5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,9,9,
10,10,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,13,13,
14,14,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15,15,15,16,16,16,16,16,16,16,16,16,16,17,17,17,17,17,17,17,17,17,17,
18,18,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,19,19
};

char ibuf[800008];
char *ibufe = ibuf-1;

__attribute__((target("arch=sandybridge")))
void readall(){
  read(STDIN_FILENO, ibuf, sizeof(ibuf));
}

__attribute__((target("arch=sandybridge")))
int read_uint(){
  int x=0;
  while(*(++ibufe) <'0');
  do {
    x *= 10;
    x += *ibufe-'0';
  } while(*(++ibufe) >='0');

  return x;
}

char buf[1977782];
char *bufe = buf;

__attribute__((target("arch=sandybridge")))
void write_uint(int x){
  int i;
  static char tmp[13];
  if(x==0){
    *bufe++ = '0';
    *bufe++ = '\n';
    return;
  }

  for(i=0; x; i++){
    if(x >= 200){
      tmp[i] = x % 10;
      x /= 10;
    }
    else {
      tmp[i] = mod10[x];
      x = div10[x];
    }
  }
  for(i--; i >= 0; i--){
    *bufe++ = '0' + tmp[i];
  }
  *bufe++ = '\n';
}

__attribute__((target("arch=sandybridge")))
void flush(){
  write(STDOUT_FILENO, buf, bufe-buf);
}


__attribute__((target("arch=sandybridge")))
void complex_mul_avx(__m256d *ar, __m256d *ai, __m256d *br, __m256d *bi){
  __m256d arbr = _mm256_mul_pd(*ar, *br);
  __m256d aibi = _mm256_mul_pd(*ai, *bi);
  __m256d arbi = _mm256_mul_pd(*ar, *bi);
  __m256d aibr = _mm256_mul_pd(*ai, *br);
  *ar = _mm256_sub_pd(arbr, aibi);
  *ai = _mm256_add_pd(arbi, aibr);
/*
  __m256d aibi = _mm256_mul_pd(*ai, *bi);
  __m256d aibr = _mm256_mul_pd(*ai, *br);
  *ai = _mm256_fmadd_pd(*ar, *bi, aibr);
  *ar = _mm256_fmsub_pd(*ar, *br, aibi);
*/
}

__attribute__((target("arch=sandybridge")))
void complex_mul(double *ar, double *ai, double br, double bi){
  double tmpar = *ar * br - *ai * bi;
  *ai = *ar * bi + *ai * br;
  *ar = tmpar;
}

__attribute__((target("arch=sandybridge")))
void fft_avx(int k, double *xr, double *xi,  const cmplx *w){
  const int m = 1 << k;
  int u = 1;
  int v = m/4;
  int i, j;

  for(int j = 0;j<v; j+=4){
    __m256d ar = _mm256_load_pd(xr+j);
    __m256d ai = _mm256_load_pd(xi+j);
    __m256d br = _mm256_load_pd(xr+j+v);
    __m256d bi = _mm256_load_pd(xi+j+v);
    _mm256_store_pd(xr+j, _mm256_add_pd(ar, br));
    _mm256_store_pd(xi+j, _mm256_add_pd(ai, bi));
    _mm256_store_pd(xr+j+v, _mm256_sub_pd(ar, br));
    _mm256_store_pd(xi+j+v, _mm256_sub_pd(ai, bi));
    _mm256_store_pd(xr+j+2*v, _mm256_add_pd(ar, bi));
    _mm256_store_pd(xi+j+2*v, _mm256_sub_pd(ai, br));
    _mm256_store_pd(xr+j+3*v, _mm256_sub_pd(ar, bi));
    _mm256_store_pd(xi+j+3*v, _mm256_add_pd(ai, br));
  }
  u <<= 2;
  v >>= 2;

  for(i=k-2;v>=4;i-=2){
    int jh;
    for(jh=0;jh<u;jh++){
      double wjr = creal(w[jh<<1]);
      double wji = cimag(w[jh<<1]);
      double wj2r = creal(w[jh]);
      double wj2i = cimag(w[jh]);
      double wj3r = wjr*wj2r-wji*wj2i;
      double wj3i = wjr*wj2i+wji*wj2r;
      int je;
        __m256d wjr4 = _mm256_broadcast_sd(& wjr);
        __m256d wji4 = _mm256_broadcast_sd(& wji);
        __m256d wj2r4 = _mm256_broadcast_sd(& wj2r);
        __m256d wj2i4 = _mm256_broadcast_sd(& wj2i);
        __m256d wj3r4 = _mm256_broadcast_sd(& wj3r);
        __m256d wj3i4 = _mm256_broadcast_sd(& wj3i);
        for(j = jh << i, je = j+v;j<je; j+=4){
          __m256d tmp0r = _mm256_load_pd(xr+j);
          __m256d tmp0i = _mm256_load_pd(xi+j);
          __m256d tmp1r = _mm256_load_pd(xr+v+j);
          __m256d tmp1i = _mm256_load_pd(xi+v+j);
          complex_mul_avx(&tmp1r, &tmp1i, &wjr4, &wji4);
          __m256d tmp2r = _mm256_load_pd(xr+2*v+j);
          __m256d tmp2i = _mm256_load_pd(xi+2*v+j);
          complex_mul_avx(&tmp2r, &tmp2i, &wj2r4, &wj2i4);
          __m256d tmp3r = _mm256_load_pd(xr+3*v+j);
          __m256d tmp3i = _mm256_load_pd(xi+3*v+j);
          complex_mul_avx(&tmp3r, &tmp3i, &wj3r4, &wj3i4);

          __m256d ttmp0r = _mm256_add_pd(tmp0r, tmp2r);
          __m256d ttmp0i = _mm256_add_pd(tmp0i, tmp2i);
          __m256d ttmp2r = _mm256_sub_pd(tmp0r, tmp2r);
          __m256d ttmp2i = _mm256_sub_pd(tmp0i, tmp2i);
          __m256d ttmp1r = _mm256_add_pd(tmp1r, tmp3r);
          __m256d ttmp1i = _mm256_add_pd(tmp1i, tmp3i);
          __m256d ttmp3r = _mm256_sub_pd(tmp1i, tmp3i);
          __m256d ttmp3i = _mm256_sub_pd(-tmp1r, -tmp3r);

          _mm256_store_pd(xr+j, _mm256_add_pd(ttmp0r, ttmp1r));
          _mm256_store_pd(xi+j, _mm256_add_pd(ttmp0i, ttmp1i));
          _mm256_store_pd(xr+j+v, _mm256_sub_pd(ttmp0r, ttmp1r));
          _mm256_store_pd(xi+j+v, _mm256_sub_pd(ttmp0i, ttmp1i));
          _mm256_store_pd(xr+j+2*v, _mm256_add_pd(ttmp2r, ttmp3r));
          _mm256_store_pd(xi+j+2*v, _mm256_add_pd(ttmp2i, ttmp3i));
          _mm256_store_pd(xr+j+3*v, _mm256_sub_pd(ttmp2r, ttmp3r));
          _mm256_store_pd(xi+j+3*v, _mm256_sub_pd(ttmp2i, ttmp3i));
      }
    }
    u <<= 2;
    v >>= 2;
  }
  for(;i>0;i-=2){
    int jh;
    for(jh=0;jh<u;jh++){
      double wjr = creal(w[jh<<1]);
      double wji = cimag(w[jh<<1]);
      double wj2r = creal(w[jh]);
      double wj2i = cimag(w[jh]);
      double wj3r = wjr*wj2r-wji*wj2i;
      double wj3i = wjr*wj2i+wji*wj2r;
      int je;
      for(j = jh << i, je = j+v;j<je; j++){
        double tmp0r = xr[j];
        double tmp0i = xi[j];
        double tmp1r = xr[j+v];
        double tmp1i = xi[j+v];
        complex_mul(&tmp1r, &tmp1i, wjr, wji);
        double tmp2r = xr[j+2*v];
        double tmp2i = xi[j+2*v];
        complex_mul(&tmp2r, &tmp2i, wj2r, wj2i);
        double tmp3r = xr[j+3*v];
        double tmp3i = xi[j+3*v];
        complex_mul(&tmp3r, &tmp3i, wj3r, wj3i);

        double ttmp0r = tmp0r + tmp2r;
        double ttmp0i = tmp0i + tmp2i;
        double ttmp2r = tmp0r - tmp2r;
        double ttmp2i = tmp0i - tmp2i;
        double ttmp1r = tmp1r + tmp3r;
        double ttmp1i = tmp1i + tmp3i;
        double ttmp3r = tmp1i - tmp3i;
        double ttmp3i = -(tmp1r - tmp3r);

        xr[j] = ttmp0r + ttmp1r;
        xi[j] = ttmp0i + ttmp1i;
        xr[j+v] = ttmp0r - ttmp1r;
        xi[j+v] = ttmp0i - ttmp1i;
        xr[j+2*v] = ttmp2r + ttmp3r;
        xi[j+2*v] = ttmp2i + ttmp3i;
        xr[j+3*v] = ttmp2r - ttmp3r;
        xi[j+3*v] = ttmp2i - ttmp3i;
      }
    }
    u <<= 2;
    v >>= 2;
  }
}

__attribute__((target("arch=sandybridge")))
void ifft_avx(int k, double *xr, double *xi,  const cmplx *w){
  const int m = 1 << k;
  int u = m/4;
  int v = 1;
  int i, j;


  if(k>=2){
    int jh;
    for(jh=0;jh<u;jh++){
      double wjr = creal(w[jh<<1]);
      double wji = -cimag(w[jh<<1]);
      double wj2r = creal(w[jh]);
      double wj2i = -cimag(w[jh]);
      double wj3r = wjr*wj2r-wji*wj2i;
      double wj3i = wjr*wj2i+wji*wj2r;
      j = jh << 2;
        double tmp0r = xr[j];
        double tmp0i = xi[j];
        double tmp1r = xr[j+v];
        double tmp1i = xi[j+v];
        double tmp2r = xr[j+2*v];
        double tmp2i = xi[j+2*v];
        double tmp3r = xr[j+3*v];
        double tmp3i = xi[j+3*v];

        double ttmp0r = tmp0r + tmp1r;
        double ttmp0i = tmp0i + tmp1i;
        double ttmp1r = tmp0r - tmp1r;
        double ttmp1i = tmp0i - tmp1i;
        double ttmp2r = tmp2r + tmp3r;
        double ttmp2i = tmp2i + tmp3i;
        double ttmp3r = -(tmp2i - tmp3i);
        double ttmp3i = tmp2r - tmp3r;

        xr[j] = ttmp0r + ttmp2r;
        xi[j] = ttmp0i + ttmp2i;
        xr[j+v] = ttmp1r + ttmp3r;
        xi[j+v] = ttmp1i + ttmp3i;
        xr[j+2*v] = ttmp0r - ttmp2r;
        xi[j+2*v] = ttmp0i - ttmp2i;
        xr[j+3*v] = ttmp1r - ttmp3r;
        xi[j+3*v] = ttmp1i - ttmp3i;

        complex_mul(xr+j+v, xi+j+v, wjr, wji);
        complex_mul(xr+j+2*v, xi+j+2*v, wj2r, wj2i);
        complex_mul(xr+j+3*v, xi+j+3*v, wj3r, wj3i);
    }
    u >>= 2;
    v <<= 2;
  }

  for(i=4;i<=k;i+=2){
    int jh;
    for(jh=0;jh<u;jh++){
      double wjr = creal(w[jh<<1]);
      double wji = -cimag(w[jh<<1]);
      double wj2r = creal(w[jh]);
      double wj2i = -cimag(w[jh]);
      double wj3r = wjr*wj2r-wji*wj2i;
      double wj3i = wjr*wj2i+wji*wj2r;
      int je;
        __m256d wjr4 = _mm256_broadcast_sd(& wjr);
        __m256d wji4 = _mm256_broadcast_sd(& wji);
        __m256d wj2r4 = _mm256_broadcast_sd(& wj2r);
        __m256d wj2i4 = _mm256_broadcast_sd(& wj2i);
        __m256d wj3r4 = _mm256_broadcast_sd(& wj3r);
        __m256d wj3i4 = _mm256_broadcast_sd(& wj3i);
        for(j = jh << i, je = j+v;j<je; j+=4){
          __m256d tmp0r = _mm256_load_pd(xr+j);
          __m256d tmp0i = _mm256_load_pd(xi+j);
          __m256d tmp1r = _mm256_load_pd(xr+v+j);
          __m256d tmp1i = _mm256_load_pd(xi+v+j);
          __m256d tmp2r = _mm256_load_pd(xr+2*v+j);
          __m256d tmp2i = _mm256_load_pd(xi+2*v+j);
          __m256d tmp3r = _mm256_load_pd(xr+3*v+j);
          __m256d tmp3i = _mm256_load_pd(xi+3*v+j);

          __m256d ttmp0r = _mm256_add_pd(tmp0r, tmp1r);
          __m256d ttmp0i = _mm256_add_pd(tmp0i, tmp1i);
          __m256d ttmp1r = _mm256_sub_pd(tmp0r, tmp1r);
          __m256d ttmp1i = _mm256_sub_pd(tmp0i, tmp1i);
          __m256d ttmp2r = _mm256_add_pd(tmp2r, tmp3r);
          __m256d ttmp2i = _mm256_add_pd(tmp2i, tmp3i);
          __m256d ttmp3r = _mm256_sub_pd(-tmp2i, -tmp3i);
          __m256d ttmp3i = _mm256_sub_pd(tmp2r, tmp3r);

          __m256d tttmp0r = _mm256_add_pd(ttmp0r, ttmp2r);
          __m256d tttmp0i = _mm256_add_pd(ttmp0i, ttmp2i);
          __m256d tttmp2r = _mm256_sub_pd(ttmp0r, ttmp2r);
          __m256d tttmp2i = _mm256_sub_pd(ttmp0i, ttmp2i);
          __m256d tttmp1r = _mm256_add_pd(ttmp1r, ttmp3r);
          __m256d tttmp1i = _mm256_add_pd(ttmp1i, ttmp3i);
          __m256d tttmp3r = _mm256_sub_pd(ttmp1r, ttmp3r);
          __m256d tttmp3i = _mm256_sub_pd(ttmp1i, ttmp3i);

          complex_mul_avx(&tttmp1r, &tttmp1i, &wjr4, &wji4);
          complex_mul_avx(&tttmp2r, &tttmp2i, &wj2r4, &wj2i4);
          complex_mul_avx(&tttmp3r, &tttmp3i, &wj3r4, &wj3i4);

          _mm256_store_pd(xr+j, tttmp0r);
          _mm256_store_pd(xi+j, tttmp0i);
          _mm256_store_pd(xr+j+v, tttmp1r);
          _mm256_store_pd(xi+j+v, tttmp1i);
          _mm256_store_pd(xr+j+2*v, tttmp2r);
          _mm256_store_pd(xi+j+2*v, tttmp2i);
          _mm256_store_pd(xr+j+3*v, tttmp3r);
          _mm256_store_pd(xi+j+3*v, tttmp3i);
      }
    }
    u >>= 2;
    v <<= 2;
  }
  if(k&1){
    for(j=0; j<m/2; j+=4){
      __m256d ar = _mm256_load_pd(xr+j);
      __m256d ai = _mm256_load_pd(xi+j);
      __m256d br = _mm256_load_pd(xr+j+m/2);
      __m256d bi = _mm256_load_pd(xi+j+m/2);
      _mm256_store_pd(xr+j, _mm256_add_pd(ar, br));
      _mm256_store_pd(xi+j, _mm256_add_pd(ai, bi));
      _mm256_store_pd(xr+j+m/2, _mm256_sub_pd(ar, br));
      _mm256_store_pd(xi+j+m/2, _mm256_sub_pd(ai, bi));
    }
  }
}



__attribute__((target("arch=sandybridge")))
void genw(int i, int b, cmplx z){
  if(b == 0){
    w[i] = z;
  }
  else {
    genw(i, b>>1, z);
    genw(i|b, b>>1, z*w[b]);
  }
}

__attribute__((target("arch=sandybridge")))
int main(){
  int n;
  const double arg = -PI2/m;

  for(int i=1, j=m/4; j; i<<=1, j>>=1){
    w[i] = cexp(I * (arg * j));
  }
  genw(0, m/4, 1);

  readall();
  n = read_uint();
  for(int i=1;i<=n;i++){
    Cr[i] = read_uint();
    Ci[i] = read_uint();
  }
//  k = 33 - __builtin_clz(n);
//  m = 1 << k;

  fft_avx(k, Cr, Ci, w);

  Ci[0] = 4 * Cr[0] * Ci[0];
  Ci[1] = 4 * Cr[1] * Ci[1];
  Cr[0] = Cr[1] = 0;

  for(int i = 2; i < m; i+=2){
    int y = 1 << (31-__builtin_clz(i));
    int j = i^(y-1);
    double tmpr = Cr[i] - Cr[j];
    double tmpi = Ci[i] + Ci[j];
    Cr[i] += Cr[j];
    Ci[i] -= Ci[j];
    complex_mul(Cr+i, Ci+i, tmpr, tmpi);
    Cr[j] = -Cr[i];
    Ci[j] =  Ci[i];
  }

  for(int i = 0; i < m; i+=2){
    double ttmpr = Cr[i] + Cr[i^1];
    double ttmpi = Ci[i] + Ci[i^1];
    double tmpr = Cr[i] - Cr[i^1];
    double tmpi = Ci[i] - Ci[i^1];
    complex_mul(&tmpr, &tmpi, creal(w[i/2]), cimag(w[i/2]));
    Cr[i/2] = ttmpr + tmpi;
    Ci[i/2] = ttmpi - tmpr;
  }

  ifft_avx(k-1, Cr, Ci, w);

  for(int i=1;i<=n;i++){
    write_uint((Cr[i])/(4*m)+0.5);
    write_uint((Ci[i])/(4*m)+0.5);
  }
  flush();
}

Submission Info

Submission Time
Task C - 高速フーリエ変換
User ryuhei
Language C (Clang 3.8.0)
Score 100
Code Size 13629 Byte
Status AC
Exec Time 17 ms
Memory 10624 KB

Judge Result

Set Name Sample All
Score / Max Score 0 / 0 100 / 100
Status
AC × 1
AC × 33
Set Name Test Cases
Sample 00_sample_01
All 00_sample_01, 01_00_01, 01_01_19, 01_02_31, 01_03_22, 01_04_31, 01_05_40, 01_06_15, 01_07_39, 01_08_28, 01_09_30, 01_10_23, 01_11_33, 01_12_11, 01_13_28, 01_14_41, 01_15_26, 01_16_49, 01_17_34, 01_18_02, 01_19_33, 01_20_29, 02_00_51254, 02_01_82431, 02_02_17056, 02_03_34866, 02_04_6779, 02_05_65534, 02_06_65535, 02_07_65536, 02_08_65537, 02_09_65538, 02_10_100000
Case Name Status Exec Time Memory
00_sample_01 AC 11 ms 8704 KB
01_00_01 AC 9 ms 8320 KB
01_01_19 AC 9 ms 8320 KB
01_02_31 AC 9 ms 8320 KB
01_03_22 AC 9 ms 8320 KB
01_04_31 AC 9 ms 8320 KB
01_05_40 AC 9 ms 8320 KB
01_06_15 AC 9 ms 8320 KB
01_07_39 AC 9 ms 8320 KB
01_08_28 AC 9 ms 8320 KB
01_09_30 AC 8 ms 8320 KB
01_10_23 AC 9 ms 8320 KB
01_11_33 AC 8 ms 8320 KB
01_12_11 AC 9 ms 8320 KB
01_13_28 AC 9 ms 8320 KB
01_14_41 AC 9 ms 8320 KB
01_15_26 AC 9 ms 8320 KB
01_16_49 AC 9 ms 8320 KB
01_17_34 AC 9 ms 8320 KB
01_18_02 AC 9 ms 8320 KB
01_19_33 AC 9 ms 8320 KB
01_20_29 AC 9 ms 8320 KB
02_00_51254 AC 13 ms 9472 KB
02_01_82431 AC 15 ms 10240 KB
02_02_17056 AC 10 ms 8576 KB
02_03_34866 AC 11 ms 9088 KB
02_04_6779 AC 9 ms 8448 KB
02_05_65534 AC 14 ms 9856 KB
02_06_65535 AC 14 ms 9856 KB
02_07_65536 AC 14 ms 9856 KB
02_08_65537 AC 14 ms 9856 KB
02_09_65538 AC 14 ms 9856 KB
02_10_100000 AC 17 ms 10624 KB