Mono.Simdを用いることで、SIMD命令を用いたベクタ演算をマネージドコードで記述できる。

例1 加算合成

二枚の8ビットARGB画像から飽和加算による加算合成画像を得る例。 4ピクセル分(16バイト)をVector16b構造体として扱い、VectorOperations.AddWithSaturationメソッドで飽和加算する。

// gmcs -unsafe -r:System,System.Drawing,Mono.Simd addition.cs
using Mono.Simd;

using System;
using System.Diagnostics;
using System.Drawing;
using System.Drawing.Imaging;

class Addition {
  private static unsafe void AddSimd(BitmapData result, BitmapData image1, BitmapData image2, int w, int h)
  {
    for (var y = 0; y < h; y++) {
      var resultPixel = (Vector16b*)((byte*)result.Scan0.ToPointer() + y * result.Stride);
      var image1Pixel = (Vector16b*)((byte*)image1.Scan0.ToPointer() + y * image1.Stride);
      var image2Pixel = (Vector16b*)((byte*)image2.Scan0.ToPointer() + y * image2.Stride);

      for (var x = 0; x < w; x += 4) {
        *(resultPixel++) = VectorOperations.AddWithSaturation(*(image1Pixel++), *(image2Pixel++));
      }
    }
  }

  private static unsafe void AddSisd(BitmapData result, BitmapData image1, BitmapData image2, int w, int h)
  {
    for (var y = 0; y < h; y++) {
      var resultPixel = (byte*)result.Scan0.ToPointer() + y * result.Stride;
      var image1Pixel = (byte*)image1.Scan0.ToPointer() + y * image1.Stride;
      var image2Pixel = (byte*)image2.Scan0.ToPointer() + y * image2.Stride;

      for (var x = 0; x < w; x++) {
        int b = *(image1Pixel++) + *(image2Pixel++);
        int g = *(image1Pixel++) + *(image2Pixel++);
        int r = *(image1Pixel++) + *(image2Pixel++);
        int a = *(image1Pixel++) + *(image2Pixel++);

        *(resultPixel++) = (byte)((0xff < b) ? 0xff : b);
        *(resultPixel++) = (byte)((0xff < g) ? 0xff : g);
        *(resultPixel++) = (byte)((0xff < r) ? 0xff : r);
        *(resultPixel++) = (byte)((0xff < a) ? 0xff : a);
      }
    }
  }

  private static Bitmap Add(bool useSimd, int width, int height, Bitmap image1, Bitmap image2)
  {
    var result = new Bitmap(width, height,PixelFormat.Format32bppArgb);

    BitmapData lockedImage1 = null;
    BitmapData lockedImage2 = null;
    BitmapData lockedResult = null;

    try {
      var rect = new Rectangle(0, 0, width, height);

      lockedImage1 = image1.LockBits(rect, ImageLockMode.ReadOnly,  PixelFormat.Format32bppArgb);
      lockedImage2 = image2.LockBits(rect, ImageLockMode.ReadOnly,  PixelFormat.Format32bppArgb);
      lockedResult = result.LockBits(rect, ImageLockMode.WriteOnly, PixelFormat.Format32bppArgb);

      var stopwatch = new Stopwatch();

      stopwatch.Start();

      if (useSimd)
        AddSimd(lockedResult, lockedImage1, lockedImage2, width, height);
      else
        AddSisd(lockedResult, lockedImage1, lockedImage2, width, height);

      stopwatch.Stop();

      Console.WriteLine("spent {0}", stopwatch.Elapsed);

      return result;
    }
    finally {
      if (lockedImage1 != null)
        image1.UnlockBits(lockedImage1);
      if (lockedImage2 != null)
        image2.UnlockBits(lockedImage2);
      if (lockedResult != null)
        result.UnlockBits(lockedResult);
    }
  }

  static void Main(string[] args)
  {
    var useSimd = (args[0] == "simd");

    if (useSimd) {
      Console.WriteLine("Mono.Simd accel mode: {0}", SimdRuntime.AccelMode);
      Console.WriteLine("acceleration: {0}", SimdRuntime.IsMethodAccelerated(typeof(VectorOperations),
                                                                             "AddWithSaturation",
                                                                             new[] {typeof(Vector16b), typeof(Vector16b)}));
    }

    using (Bitmap image1 = new Bitmap(args[1]), image2 = new Bitmap(args[2])) {
      var width  = image1.Width;
      var height = image1.Height;

      Console.WriteLine("image size {0}x{1}", width, height);

      if (useSimd && (width & 3) != 0)
        throw new ApplicationException("width must be n * 4");

      if (width != image2.Width || height != image2.Height)
        throw new ApplicationException("not same size");

      for (var act = 0; act < 5; act++) {
        var result = Add(useSimd, width, height, image1, image2);

        if (act == 0)
          result.Save(args[3], ImageFormat.Bmp);

        result.Dispose();
      }
    }
  }
}

処理時間。

Mono.Simd使用なし
$ mono addition.exe sisd image1.bmp image2.bmp result.bmp
image size 1280x720
spent 00:00:00.0129155
spent 00:00:00.0120989
spent 00:00:00.0121380
spent 00:00:00.0122019
spent 00:00:00.0121072
Mono.Simd使用あり、SIMD命令使用あり
$ mono addition.exe simd image1.bmp image2.bmp result.bmp
Mono.Simd accel mode: SSE1, SSE2, SSE3, SSSE3
acceleration: True
image size 1280x720
spent 00:00:00.0036350
spent 00:00:00.0029663
spent 00:00:00.0030908
spent 00:00:00.0040916
spent 00:00:00.0030134
Mono.Simd使用あり、SIMD命令使用なし
$ mono -O=-simd addition.exe simd image1.bmp image2.bmp result.bmp
Mono.Simd accel mode: None
acceleration: False
image size 1280x720
spent 00:00:00.0187466
spent 00:00:00.0178911
spent 00:00:00.0177878
spent 00:00:00.0177766
spent 00:00:00.0177772

参考までに、実行した環境・CPUは以下のとおり。

$ mono -V
Mono JIT compiler version 2.5 (/trunk/mono r129678 2009年  3月 18日 水曜日 19:49:22 JST)
Copyright (C) 2002-2008 Novell, Inc and Contributors. www.mono-project.com
	TLS:           __thread
	GC:            Included Boehm (with typed GC and Parallel Mark)
	SIGSEGV:       altstack
	Notifications: epoll
	Architecture:  x86
	Disabled:      none

$ cat /proc/cpuinfo 
processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 15
model name	: Intel(R) Core(TM)2 CPU          6600  @ 2.40GHz
stepping	: 6
cpu MHz		: 1596.000
cache size	: 4096 KB
physical id	: 0
siblings	: 2
core id		: 0
cpu cores	: 2
apicid		: 0
initial apicid	: 0
fdiv_bug	: no
hlt_bug		: no
f00f_bug	: no
coma_bug	: no
fpu		: yes
fpu_exception	: yes
cpuid level	: 10
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe nx lm constant_tsc arch_perfmon pebs bts pni monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr lahf_lm
bogomips	: 4799.98
clflush size	: 64
power management:

processor	: 1
vendor_id	: GenuineIntel
cpu family	: 6
model		: 15
model name	: Intel(R) Core(TM)2 CPU          6600  @ 2.40GHz
stepping	: 6
cpu MHz		: 1596.000
cache size	: 4096 KB
physical id	: 0
siblings	: 2
core id		: 1
cpu cores	: 2
apicid		: 1
initial apicid	: 1
fdiv_bug	: no
hlt_bug		: no
f00f_bug	: no
coma_bug	: no
fpu		: yes
fpu_exception	: yes
cpuid level	: 10
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe nx lm constant_tsc arch_perfmon pebs bts pni monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr lahf_lm
bogomips	: 4800.12
clflush size	: 64
power management:

例2 離散コサイン変換(DCT)

8x8 Forward DCTにより画像を変換し、変換したものを8x8 Inverse DCTで復元する例。 floatで表される4ピクセル分(16バイト)をVector4f構造体として扱い、DCT係数の積和を演算する。

まず処理時間。 使用したソースコードは後述。

Mono.Simd使用なし
$ mono dct.exe sisd image.bmp .
image size 1280x720
forward DCT spent 00:00:00.0861134
inverse DCT spent 00:00:00.0645653
forward DCT spent 00:00:00.0815288
inverse DCT spent 00:00:00.0649667
forward DCT spent 00:00:00.0816030
inverse DCT spent 00:00:00.0668463
forward DCT spent 00:00:00.0815200
inverse DCT spent 00:00:00.0651595
forward DCT spent 00:00:00.0850448
inverse DCT spent 00:00:00.0655908
Mono.Simd使用あり、SIMD命令使用あり
$ mono dct.exe simd image.bmp .
Mono.Simd accel mode: SSE1, SSE2, SSE3, SSSE3
acceleration: Vector4f.op_Addition = True
acceleration: Vector4f.op_Multiply = True
image size 1280x720
forward DCT spent 00:00:00.0366700
inverse DCT spent 00:00:00.0326371
forward DCT spent 00:00:00.0348020
inverse DCT spent 00:00:00.0312227
forward DCT spent 00:00:00.0329452
inverse DCT spent 00:00:00.0329587
forward DCT spent 00:00:00.0489560
inverse DCT spent 00:00:00.0493336
forward DCT spent 00:00:00.0323098
inverse DCT spent 00:00:00.0338064
Mono.Simd使用あり、SIMD命令使用なし
$ mono -O=-simd dct.exe simd image.bmp .
Mono.Simd accel mode: None
acceleration: Vector4f.op_Addition = False
acceleration: Vector4f.op_Multiply = False
image size 1280x720
forward DCT spent 00:00:00.1917066
inverse DCT spent 00:00:00.1856522
forward DCT spent 00:00:00.1853029
inverse DCT spent 00:00:00.1826358
forward DCT spent 00:00:00.1866373
inverse DCT spent 00:00:00.1803647
forward DCT spent 00:00:00.1832263
inverse DCT spent 00:00:00.1786115
forward DCT spent 00:00:00.1827668
inverse DCT spent 00:00:00.1789582

以下は検証に使用したコード。 SIMD版、SISD版ともにX方向とY方向のDCTを別々に演算することで積和の演算回数を減らしている(#define TESTを有効にすると、演算回数の多い実装が有効になる)。 また、SIMD版のDCTでは、処理しやすいようにマトリクスを転置してから積和を求めるようにしてある。

// gmcs -unsafe -r:System,System.Drawing,Mono.Simd dct.cs
using Mono.Simd;

using System;
using System.Diagnostics;
using System.Drawing;
using System.Drawing.Imaging;
using System.IO;
using System.Runtime.InteropServices;

class DCTSample {
  private const PixelFormat Format8bppGrayscale = PixelFormat.Format8bppIndexed;

  private static Vector4f[] coefficientsVector = null;
  private static Vector4f[] transposedCoefficientsVector = null;
  private static float[] coefficientsArray = null;

  private static void InitializeCoefficients(bool simd)
  {
    var coefficients = new float[8 * 8];

    for (var y = 0; y < 8; y++) {
      for (var x = 0; x < 8; x++) {
        coefficients[y * 8 + x] = (x == 0) ? (float)(0.5 / Math.Sqrt(2.0))
                                           : (float)(0.5 * Math.Cos((x * (2 * y + 1) * Math.PI) / 16.0));
      }
    }

    if (simd) {
      coefficientsVector           = new Vector4f[8 * 8 / 4];
      transposedCoefficientsVector = new Vector4f[8 * 8 / 4];

      for (var i = 0; i < 64; i += 4) {
        coefficientsVector[i / 4] = ArrayExtensions.GetVector(coefficients, i);
      }

      for (var i = 0; i < 8; i++) {
        transposedCoefficientsVector[i * 2    ] = new Vector4f(coefficients[i + 0],
                                                               coefficients[i + 8],
                                                               coefficients[i + 16],
                                                               coefficients[i + 24]);
        transposedCoefficientsVector[i * 2 + 1] = new Vector4f(coefficients[i + 32],
                                                               coefficients[i + 40],
                                                               coefficients[i + 48],
                                                               coefficients[i + 56]);
      }
    }
    else {
      coefficientsArray = coefficients;
    }
  }

  private static unsafe void ForwardDCTSimd(BitmapData image, IntPtr dctBuffer, int w, int h)
  {
    var dct_temp = stackalloc float[8 * 8];
    var transposed = stackalloc float[8 * 8];
    var stride = w;

    var scan0_p = (byte*)image.Scan0.ToPointer();
    var scan0_c = (float*)dctBuffer.ToPointer();

    for (var blockY = 0; blockY < h; blockY += 8) {
      var block_p = scan0_p + blockY * image.Stride;
      var block_c = scan0_c + blockY * stride;

      for (var blockX = 0; blockX < w; blockX += 8, block_c += 8, block_p += 8) {
        { // transpose matrix
          var pix_t = transposed;

          for (var tx = 0; tx < 8; tx++) {
            var pix_p_xy = block_p + tx;

            for (var ty = 0; ty < 8; ty++) {
              *(pix_t++) = (float)*pix_p_xy;
              pix_p_xy += image.Stride;
            }
          }
        }

        { // y
          var dct_temp_y = dct_temp;

          for (var v = 0; v < 8; v++) {
            var vec_p_xy = (Vector4f*)transposed;

            for (var x = 0; x < 8; x++) {
              var t  = *(vec_p_xy++) * transposedCoefficientsVector[/* (v * 8 / 4) */ v * 2];
                  t += *(vec_p_xy++) * transposedCoefficientsVector[/* (v * 8 / 4) */ v * 2 + 1];

              *(dct_temp_y++) = t.X + t.Y + t.Z + t.W;
            }
          }
        }

        { // x
          for (var v = 0; v < 8; v++) {
            var pix_c_uv = block_c + v * stride;
            var dct_temp_x = (Vector4f*)(dct_temp + v * 8);

            for (var u = 0; u < 8; u++) {
              var vec_c  = dct_temp_x[0] * transposedCoefficientsVector[/* (u * 8 / 4) */ u * 2];
                  vec_c += dct_temp_x[1] * transposedCoefficientsVector[/* (u * 8 / 4) */ u * 2 + 1];

              pix_c_uv[u] = vec_c.X + vec_c.Y + vec_c.Z + vec_c.W;
            }
          }
        }
      }
    }
  }

  private static unsafe void InverseDCTSimd(IntPtr dctBuffer, BitmapData image, int w, int h)
  {
    var dct_temp = stackalloc float[8 * 8];
    var transposed = stackalloc float[8 * 8];
    var stride = w;

    var scan0_p = (byte*)image.Scan0.ToPointer();
    var scan0_c = (float*)dctBuffer.ToPointer();

    for (var blockY = 0; blockY < h; blockY += 8) {
      var block_p = scan0_p + blockY * image.Stride;
      var block_c = scan0_c + blockY * stride;

      for (var blockX = 0; blockX < w; blockX += 8, block_c += 8, block_p += 8) {
        { // transpose matrix
          var pix_t = transposed;

          for (var tu = 0; tu < 8; tu++) {
            var pix_c_uv = block_c + tu;

            for (var tv = 0; tv < 8; tv++) {
              *(pix_t++) = *pix_c_uv;
              pix_c_uv += stride;
            }
          }
        }

        { // y
          var dct_temp_y = dct_temp;

          for (var y = 0; y < 8; y++) {
            var vec_c_uv = (Vector4f*)transposed;

            for (var u = 0; u < 8; u++) {
              var t  = *(vec_c_uv++) * coefficientsVector[/* (y * 8 / 4) */ y * 2];
                  t += *(vec_c_uv++) * coefficientsVector[/* (y * 8 / 4) */ y * 2 + 1];

              *(dct_temp_y++) = t.X + t.Y + t.Z + t.W;
            }
          }
        }

        { // x
          for (var y = 0; y < 8; y++) {
            var pix_p_xy = block_p + y * image.Stride;
            var dct_temp_x = (Vector4f*)(dct_temp + y * 8);

            for (var x = 0; x < 8; x++) {
              var vec_p  = dct_temp_x[0] * coefficientsVector[/* (x * 8 / 4) */ x * 2];
                  vec_p += dct_temp_x[1] * coefficientsVector[/* (x * 8 / 4) */ x * 2 + 1];

              pix_p_xy[x] = (byte)(vec_p.X + vec_p.Y + vec_p.Z + vec_p.W);
            }
          }
        }
      }
    }
  }

  private static unsafe void ForwardDCTSisd(BitmapData image, IntPtr dctBuffer, int w, int h)
  {
#if !TEST
    var dct_temp = stackalloc float[8 * 8];
#endif
    var stride = w;

    var scan0_p = (byte*)image.Scan0.ToPointer();
    var scan0_c = (float*)dctBuffer.ToPointer();

    for (var blockY = 0; blockY < h; blockY += 8) {
      var block_p = scan0_p + blockY * image.Stride;
      var block_c = scan0_c + blockY * stride;

      for (var blockX = 0; blockX < w; blockX += 8, block_p += 8, block_c += 8) {
#if !TEST
        { // y
          var dct_temp_y = dct_temp;

          for (var v = 0; v < 8; v++) {
            for (var x = 0; x < 8; x++) {
              var pix_p_xy = block_p + x;
              var t = 0.0f;

              for (var y = 0; y < 8; y++) {
                t += *pix_p_xy * coefficientsArray[y * 8 + v];
                pix_p_xy += image.Stride;
              }

              *(dct_temp_y++) = t;
            }
          }
        }

        { // x
          for (var v = 0; v < 8; v++) {
            var pix_c_uv = block_c + v * stride;

            for (var u = 0; u < 8; u++) {
              var dct_temp_x = dct_temp + v * 8;
              var pix_c = 0.0f;

              for (var x = 0; x < 8; x++) {
                pix_c += dct_temp_x[x] * coefficientsArray[x * 8 + u];
              }

              pix_c_uv[u] = pix_c;
            }
          }
        }
#else
        for (var v = 0; v < 8; v++) {
          var pix_c_uv = block_c + v * stride;

          for (var u = 0; u < 8; u++) {
            var pix_c = 0.0f;

            for (var y = 0; y < 8; y++) {
              var pix_p_xy = block_p + y * image.Stride;

              for (var x = 0; x < 8; x++) {
                pix_c += pix_p_xy[x] * coefficientsArray[y * 8 + v] * coefficientsArray[x * 8 + u];
              }
            }

            pix_c_uv[u] = pix_c;
          }
        }
#endif
      }
    }
  }

  private static unsafe void InverseDCTSisd(IntPtr dctBuffer, BitmapData image, int w, int h)
  {
#if !TEST
    var dct_temp = stackalloc float[8 * 8];
#endif
    var stride = w;

    var scan0_p = (byte*)image.Scan0.ToPointer();
    var scan0_c = (float*)dctBuffer.ToPointer();

    for (var blockY = 0; blockY < h; blockY += 8) {
      var block_p = scan0_p + blockY * image.Stride;
      var block_c = scan0_c + blockY * stride;

      for (var blockX = 0; blockX < w; blockX += 8, block_c += 8, block_p += 8) {
#if !TEST
        { // y
          var dct_temp_y = dct_temp;

          for (var y = 0; y < 8; y++) {
            for (var u = 0; u < 8; u++) {
              var pix_c_uv = block_c + u;
              var t = 0.0f;

              for (var v = 0; v < 8; v++) {
                t += *pix_c_uv * coefficientsArray[y * 8 + v];
                pix_c_uv += stride;
              }

              *(dct_temp_y++) = t;
            }
          }
        }

        { // x
          for (var y = 0; y < 8; y++) {
            var pix_p_xy = block_p + y * image.Stride;

            for (var x = 0; x < 8; x++) {
              var dct_temp_x = dct_temp + y * 8;
              var pix_p = 0.0f;

              for (var u = 0; u < 8; u++) {
                pix_p += dct_temp_x[u] * coefficientsArray[x * 8 + u];
              }

              pix_p_xy[x] = (byte)pix_p;
            }
          }
        }
#else
        for (var y = 0; y < 8; y++) {
          var pix_p_xy = block_p + y * image.Stride;

          for (var x = 0; x < 8; x++) {
            var pix_p = 0.0f;

            for (var v = 0; v < 8; v++) {
              var pix_c_uv = block_c + v * stride;

              for (var u = 0; u < 8; u++) {
                pix_p += pix_c_uv[u] * coefficientsArray[y * 8 + v] * coefficientsArray[x * 8 + u];
              }
            }

            pix_p_xy[x] = (byte)pix_p;
          }
        }
#endif
      }
    }
  }

  private static void ForwardDCT(bool simd, Bitmap luminance, IntPtr dctBuffer)
  {
    DCT(simd, true, luminance.Width, luminance.Height, luminance, dctBuffer);
  }

  private static Bitmap InverseDCT(bool simd, int width, int height, IntPtr dctBuffer)
  {
    var image = Create8bppGrayscaleBitmap(width, height);

    DCT(simd, false, width, height, image, dctBuffer);

    return image;
  }

  private static void DCT(bool simd, bool forward, int width, int height, Bitmap image, IntPtr dctBuffer)
  {
    var rect = new Rectangle(0, 0, width, height);

    BitmapData locked   = null;

    try {
      locked = image.LockBits(rect, forward ? ImageLockMode.ReadOnly : ImageLockMode.WriteOnly, Format8bppGrayscale);

      var stopwatch = new Stopwatch();

      stopwatch.Start();

      if (forward) {
        if (simd)
          ForwardDCTSimd(locked, dctBuffer, rect.Width, rect.Height);
        else
          ForwardDCTSisd(locked, dctBuffer, rect.Width, rect.Height);
      }
      else {
        if (simd)
          InverseDCTSimd(dctBuffer, locked, rect.Width, rect.Height);
        else
          InverseDCTSisd(dctBuffer, locked, rect.Width, rect.Height);
      }

      stopwatch.Stop();

      Console.WriteLine("{0} DCT spent {1}", forward ? "forward" : "inverse", stopwatch.Elapsed);
    }
    finally {
      if (locked != null)
        image.UnlockBits(locked);
    }
  }

  private static Bitmap CreateLuminanceImage(Bitmap imageColored)
  {
    var imageLuminance = Create8bppGrayscaleBitmap(imageColored.Width, imageColored.Height);

    BitmapData lockedColored   = null;
    BitmapData lockedLuminance = null;

    try {
      var rect = new Rectangle(0, 0, imageColored.Width, imageColored.Height);

      lockedColored   = imageColored  .LockBits(rect, ImageLockMode.ReadOnly,  PixelFormat.Format24bppRgb);
      lockedLuminance = imageLuminance.LockBits(rect, ImageLockMode.WriteOnly, Format8bppGrayscale);

      unsafe {
        for (var y = 0; y < imageColored.Height; y++) {
          var bgr = (byte*)  lockedColored.Scan0.ToPointer() + y * lockedColored.Stride;
          var lum = (byte*)lockedLuminance.Scan0.ToPointer() + y * lockedLuminance.Stride;

          for (var x = 0; x < imageColored.Width; x++) {
            // 0.299R + 0.587G + 0.114B
            *(lum++) = (byte)((*(bgr++) * 0114 / 1000) +
                              (*(bgr++) * 0587 / 1000) +
                              (*(bgr++) * 0299 / 1000));
          }
        }
      }

      return imageLuminance;
    }
    finally {
      if (lockedColored != null)
        imageColored.UnlockBits(lockedColored);
      if (lockedLuminance != null)
        imageLuminance.UnlockBits(lockedLuminance);
    }
  }

  private static Bitmap Create8bppGrayscaleBitmap(int width, int height)
  {
    var grayscaled = new Bitmap(width, height, Format8bppGrayscale);

    // 8bpp indexed as 8bpp grayscale
    var palette = grayscaled.Palette;

    for (var i = 0; i < palette.Entries.Length; i++) {
      palette.Entries[i] = Color.FromArgb(0xff, i, i, i);
    }

    grayscaled.Palette = palette;

    return grayscaled;
  }

  static void Main(string[] args)
  {
    var useSimd = (args[0] == "simd");

    if (useSimd) {
      Console.WriteLine("Mono.Simd accel mode: {0}", SimdRuntime.AccelMode);

      foreach (var method in new[] {
        new {Type = typeof(Vector4f), Method = "op_Addition", Signature = new[] {typeof(Vector4f), typeof(Vector4f)}},
        new {Type = typeof(Vector4f), Method = "op_Multiply", Signature = new[] {typeof(Vector4f), typeof(Vector4f)}},
      }) {
        Console.WriteLine("acceleration: {0}.{1} = {2}",
                          method.Type.Name,
                          method.Method,
                          SimdRuntime.IsMethodAccelerated(method.Type,
                                                          method.Method,
                                                          method.Signature));
      }
    }

    using (var image = new Bitmap(args[1])) {
      Console.WriteLine("image size {0}x{1}", image.Width, image.Height);

      var w = image.Width;
      var h = image.Height;

      if ((w & 7) != 0 || (h & 7) != 0)
        throw new ApplicationException("both width and height must be n * 8");

      InitializeCoefficients(useSimd);

      using (var luminance = CreateLuminanceImage(image)) {
        var dctBuffer = Marshal.AllocCoTaskMem(sizeof(float) * w * h);

        for (var act = 0; act < 5; act++) {
          ForwardDCT(useSimd, luminance, dctBuffer);

          using (var inverted = InverseDCT(useSimd, w, h, dctBuffer)) {
            if (act != 0)
              continue;

            luminance.Save(Path.Combine(args[2], "luminance.bmp"), ImageFormat.Bmp);
            inverted.Save(Path.Combine(args[2], "inverted.bmp"), ImageFormat.Bmp);
          }
        }

        Marshal.FreeCoTaskMem(dctBuffer);
      }
    }
  }
}