Mono.Simdを用いることで、SIMD命令を用いたベクタ演算をマネージドコードで記述できる。
二枚の32ビットARGB画像(各チャンネル8ビット)から飽和加算による加算合成画像を得る例。 4ピクセル分(16バイト)をVector16b構造体として扱い、VectorOperations.AddWithSaturationメソッドで飽和加算する。
// gmcs -unsafe -r:System,System.Drawing,Mono.Simd addition.cs
using Mono.Simd;
using System;
using System.Diagnostics;
using System.Drawing;
using System.Drawing.Imaging;

/// <summary>
/// Benchmark that blends two 32bpp ARGB images by per-channel saturating
/// addition, either with Mono.Simd (16 bytes per operation) or with plain
/// scalar code, and prints the elapsed time of each run.
/// </summary>
class Addition {
    /// <summary>
    /// SIMD kernel: adds the two source images into <paramref name="result"/>
    /// four pixels (one <c>Vector16b</c>) at a time.
    /// </summary>
    /// <param name="result">Locked destination bitmap data.</param>
    /// <param name="image1">Locked first source bitmap data.</param>
    /// <param name="image2">Locked second source bitmap data.</param>
    /// <param name="w">Width in pixels; must be a multiple of 4.</param>
    /// <param name="h">Height in pixels.</param>
    /// <exception cref="ArgumentException">
    /// <paramref name="w"/> is not a multiple of 4.
    /// </exception>
    private static unsafe void AddSimd(BitmapData result, BitmapData image1, BitmapData image2, int w, int h) {
        // The inner loop always consumes 4 pixels; any other width would
        // read and write past the end of each row.
        if ((w & 3) != 0) {
            throw new ArgumentException("width must be n * 4", "w");
        }

        var resultScan0 = (byte*)result.Scan0.ToPointer();
        var image1Scan0 = (byte*)image1.Scan0.ToPointer();
        var image2Scan0 = (byte*)image2.Scan0.ToPointer();

        for (var y = 0; y < h; y++) {
            // Rows are addressed through Stride because a scan line may be
            // padded beyond w * 4 bytes.
            var resultPixel = (Vector16b*)(resultScan0 + y * result.Stride);
            var image1Pixel = (Vector16b*)(image1Scan0 + y * image1.Stride);
            var image2Pixel = (Vector16b*)(image2Scan0 + y * image2.Stride);

            for (var x = 0; x < w; x += 4) {
                *(resultPixel++) = VectorOperations.AddWithSaturation(*(image1Pixel++), *(image2Pixel++));
            }
        }
    }

    /// <summary>
    /// Scalar kernel: adds the two source images into
    /// <paramref name="result"/> one channel at a time, clamping each sum
    /// to 0xff.
    /// </summary>
    /// <param name="result">Locked destination bitmap data.</param>
    /// <param name="image1">Locked first source bitmap data.</param>
    /// <param name="image2">Locked second source bitmap data.</param>
    /// <param name="w">Width in pixels.</param>
    /// <param name="h">Height in pixels.</param>
    private static unsafe void AddSisd(BitmapData result, BitmapData image1, BitmapData image2, int w, int h) {
        var resultScan0 = (byte*)result.Scan0.ToPointer();
        var image1Scan0 = (byte*)image1.Scan0.ToPointer();
        var image2Scan0 = (byte*)image2.Scan0.ToPointer();

        for (var y = 0; y < h; y++) {
            var resultPixel = resultScan0 + y * result.Stride;
            var image1Pixel = image1Scan0 + y * image1.Stride;
            var image2Pixel = image2Scan0 + y * image2.Stride;

            for (var x = 0; x < w; x++) {
                // Channels are consumed in the order the original code
                // names them: B, G, R, A.
                int b = *(image1Pixel++) + *(image2Pixel++);
                int g = *(image1Pixel++) + *(image2Pixel++);
                int r = *(image1Pixel++) + *(image2Pixel++);
                int a = *(image1Pixel++) + *(image2Pixel++);

                *(resultPixel++) = (byte)((0xff < b) ? 0xff : b);
                *(resultPixel++) = (byte)((0xff < g) ? 0xff : g);
                *(resultPixel++) = (byte)((0xff < r) ? 0xff : r);
                *(resultPixel++) = (byte)((0xff < a) ? 0xff : a);
            }
        }
    }

    /// <summary>
    /// Locks both source images and a freshly created result bitmap, runs the
    /// selected kernel under a stopwatch, prints the elapsed time and returns
    /// the result.
    /// </summary>
    /// <param name="useSimd">True to run the SIMD kernel, false for the scalar one.</param>
    /// <param name="width">Width in pixels of both images.</param>
    /// <param name="height">Height in pixels of both images.</param>
    /// <param name="image1">First source image.</param>
    /// <param name="image2">Second source image.</param>
    /// <returns>A new 32bpp ARGB bitmap owned by the caller.</returns>
    private static Bitmap Add(bool useSimd, int width, int height, Bitmap image1, Bitmap image2) {
        var result = new Bitmap(width, height, PixelFormat.Format32bppArgb);
        var succeeded = false;

        BitmapData lockedImage1 = null;
        BitmapData lockedImage2 = null;
        BitmapData lockedResult = null;

        try {
            var rect = new Rectangle(0, 0, width, height);

            lockedImage1 = image1.LockBits(rect, ImageLockMode.ReadOnly, PixelFormat.Format32bppArgb);
            lockedImage2 = image2.LockBits(rect, ImageLockMode.ReadOnly, PixelFormat.Format32bppArgb);
            lockedResult = result.LockBits(rect, ImageLockMode.WriteOnly, PixelFormat.Format32bppArgb);

            var stopwatch = new Stopwatch();

            stopwatch.Start();

            if (useSimd) {
                AddSimd(lockedResult, lockedImage1, lockedImage2, width, height);
            }
            else {
                AddSisd(lockedResult, lockedImage1, lockedImage2, width, height);
            }

            stopwatch.Stop();

            Console.WriteLine("spent {0}", stopwatch.Elapsed);

            succeeded = true;

            return result;
        }
        finally {
            if (lockedImage1 != null) {
                image1.UnlockBits(lockedImage1);
            }
            if (lockedImage2 != null) {
                image2.UnlockBits(lockedImage2);
            }
            if (lockedResult != null) {
                result.UnlockBits(lockedResult);
            }
            // The caller never sees the bitmap when an exception escapes,
            // so it has to be released here.
            if (!succeeded) {
                result.Dispose();
            }
        }
    }

    /// <summary>
    /// Entry point. Arguments: mode ("simd" selects the SIMD kernel, anything
    /// else the scalar one), first image path, second image path, output path.
    /// Runs the addition five times and saves the first result as BMP.
    /// </summary>
    /// <param name="args">Command line arguments as described above.</param>
    public static void Main(string[] args) {
        if (args == null || args.Length < 4) {
            Console.Error.WriteLine("usage: addition.exe (simd|sisd) image1 image2 result");
            Environment.ExitCode = 1;
            return;
        }

        var useSimd = (args[0] == "simd");

        if (useSimd) {
            Console.WriteLine("Mono.Simd accel mode: {0}", SimdRuntime.AccelMode);
            Console.WriteLine("acceleration: {0}", SimdRuntime.IsMethodAccelerated(typeof(VectorOperations), "AddWithSaturation", new[] {typeof(Vector16b), typeof(Vector16b)}));
        }

        using (Bitmap image1 = new Bitmap(args[1]), image2 = new Bitmap(args[2])) {
            var width = image1.Width;
            var height = image1.Height;

            Console.WriteLine("image size {0}x{1}", width, height);

            if (useSimd && (width & 3) != 0) {
                throw new ApplicationException("width must be n * 4");
            }
            if (width != image2.Width || height != image2.Height) {
                throw new ApplicationException("not same size");
            }

            for (var act = 0; act < 5; act++) {
                // using guarantees the result is released even if Save throws.
                using (var result = Add(useSimd, width, height, image1, image2)) {
                    if (act == 0) {
                        result.Save(args[3], ImageFormat.Bmp);
                    }
                }
            }
        }
    }
}
処理時間。 上から順に、SISD版、SIMD版、SIMD最適化を無効化(-O=-simd)した状態でのSIMD版の実行結果。
$ mono addition.exe sisd image1.bmp image2.bmp result.bmp image size 1280x720 spent 00:00:00.0129155 spent 00:00:00.0120989 spent 00:00:00.0121380 spent 00:00:00.0122019 spent 00:00:00.0121072
$ mono addition.exe simd image1.bmp image2.bmp result.bmp Mono.Simd accel mode: SSE1, SSE2, SSE3, SSSE3 acceleration: True image size 1280x720 spent 00:00:00.0036350 spent 00:00:00.0029663 spent 00:00:00.0030908 spent 00:00:00.0040916 spent 00:00:00.0030134
$ mono -O=-simd addition.exe simd image1.bmp image2.bmp result.bmp Mono.Simd accel mode: None acceleration: False image size 1280x720 spent 00:00:00.0187466 spent 00:00:00.0178911 spent 00:00:00.0177878 spent 00:00:00.0177766 spent 00:00:00.0177772
参考までに、実行した環境・CPUは以下のとおり。
$ mono -V Mono JIT compiler version 2.5 (/trunk/mono r129678 2009年 3月 18日 水曜日 19:49:22 JST) Copyright (C) 2002-2008 Novell, Inc and Contributors. www.mono-project.com TLS: __thread GC: Included Boehm (with typed GC and Parallel Mark) SIGSEGV: altstack Notifications: epoll Architecture: x86 Disabled: none $ cat /proc/cpuinfo processor : 0 vendor_id : GenuineIntel cpu family : 6 model : 15 model name : Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz stepping : 6 cpu MHz : 1596.000 cache size : 4096 KB physical id : 0 siblings : 2 core id : 0 cpu cores : 2 apicid : 0 initial apicid : 0 fdiv_bug : no hlt_bug : no f00f_bug : no coma_bug : no fpu : yes fpu_exception : yes cpuid level : 10 wp : yes flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe nx lm constant_tsc arch_perfmon pebs bts pni monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr lahf_lm bogomips : 4799.98 clflush size : 64 power management: processor : 1 vendor_id : GenuineIntel cpu family : 6 model : 15 model name : Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz stepping : 6 cpu MHz : 1596.000 cache size : 4096 KB physical id : 0 siblings : 2 core id : 1 cpu cores : 2 apicid : 1 initial apicid : 1 fdiv_bug : no hlt_bug : no f00f_bug : no coma_bug : no fpu : yes fpu_exception : yes cpuid level : 10 wp : yes flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe nx lm constant_tsc arch_perfmon pebs bts pni monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr lahf_lm bogomips : 4800.12 clflush size : 64 power management:
8x8 Forward DCTにより画像を変換し、変換したものを8x8 Inverse DCTで復元する例。 floatで表される4ピクセル分(16バイト)をVector4f構造体として扱い、DCT係数の積和を演算する。
まず処理時間。 上から順に、SISD版、SIMD版、SIMD最適化を無効化(-O=-simd)した状態でのSIMD版の実行結果。 使用したソースコードは後述。
$ mono dct.exe sisd image.bmp . image size 1280x720 forward DCT spent 00:00:00.0861134 inverse DCT spent 00:00:00.0645653 forward DCT spent 00:00:00.0815288 inverse DCT spent 00:00:00.0649667 forward DCT spent 00:00:00.0816030 inverse DCT spent 00:00:00.0668463 forward DCT spent 00:00:00.0815200 inverse DCT spent 00:00:00.0651595 forward DCT spent 00:00:00.0850448 inverse DCT spent 00:00:00.0655908
$ mono dct.exe simd image.bmp . Mono.Simd accel mode: SSE1, SSE2, SSE3, SSSE3 acceleration: Vector4f.op_Addition = True acceleration: Vector4f.op_Multiply = True image size 1280x720 forward DCT spent 00:00:00.0366700 inverse DCT spent 00:00:00.0326371 forward DCT spent 00:00:00.0348020 inverse DCT spent 00:00:00.0312227 forward DCT spent 00:00:00.0329452 inverse DCT spent 00:00:00.0329587 forward DCT spent 00:00:00.0489560 inverse DCT spent 00:00:00.0493336 forward DCT spent 00:00:00.0323098 inverse DCT spent 00:00:00.0338064
$ mono -O=-simd dct.exe simd image.bmp . Mono.Simd accel mode: None acceleration: Vector4f.op_Addition = False acceleration: Vector4f.op_Multiply = False image size 1280x720 forward DCT spent 00:00:00.1917066 inverse DCT spent 00:00:00.1856522 forward DCT spent 00:00:00.1853029 inverse DCT spent 00:00:00.1826358 forward DCT spent 00:00:00.1866373 inverse DCT spent 00:00:00.1803647 forward DCT spent 00:00:00.1832263 inverse DCT spent 00:00:00.1786115 forward DCT spent 00:00:00.1827668 inverse DCT spent 00:00:00.1789582
以下は検証に使用したコード。 SIMD版、SISD版ともにX方向とY方向のDCTを別々に演算することで積和の演算回数を減らしている(#define TESTを有効にすると、演算回数の多い実装が有効になる)。 また、SIMD版のDCTでは、処理しやすいようにマトリクスを転置してから積和を求めるようにしてある。
// gmcs -unsafe -r:System,System.Drawing,Mono.Simd dct.cs
using Mono.Simd;
using System;
using System.Diagnostics;
using System.Drawing;
using System.Drawing.Imaging;
using System.IO;
using System.Runtime.InteropServices;

/// <summary>
/// Benchmark that converts an image to 8bpp luminance, applies an 8x8 forward
/// DCT to every block and restores the image with an 8x8 inverse DCT, using
/// either Mono.Simd (<c>Vector4f</c>) or scalar code. The X and Y directions
/// are transformed in separate passes; defining TEST switches the scalar
/// kernels to the direct (more multiplications) formulation.
/// </summary>
class DCTSample {
    // GDI+ has no usable 8bpp grayscale format, so an indexed bitmap with a
    // gray ramp palette is used instead (see Create8bppGrayscaleBitmap).
    private const PixelFormat Format8bppGrayscale = PixelFormat.Format8bppIndexed;

    // coefficients[s * 8 + f] packed row-wise, 4 floats per vector (SIMD mode only).
    private static Vector4f[] coefficientsVector = null;
    // The same table transposed: entry f*2 / f*2+1 holds coefficients[s * 8 + f] for s = 0..3 / 4..7.
    private static Vector4f[] transposedCoefficientsVector = null;
    // coefficients[s * 8 + f] as a flat array (scalar mode only).
    private static float[] coefficientsArray = null;

    /// <summary>
    /// Builds the 8x8 DCT coefficient table and stores it in the
    /// representation needed by the selected mode.
    /// </summary>
    /// <param name="simd">
    /// True to fill the two <c>Vector4f</c> tables, false to fill
    /// <c>coefficientsArray</c>.
    /// </param>
    private static void InitializeCoefficients(bool simd) {
        var coefficients = new float[8 * 8];

        for (var y = 0; y < 8; y++) {
            for (var x = 0; x < 8; x++) {
                coefficients[y * 8 + x] = (x == 0)
                    ? (float)(0.5 / Math.Sqrt(2.0))
                    : (float)(0.5 * Math.Cos((x * (2 * y + 1) * Math.PI) / 16.0));
            }
        }

        if (simd) {
            coefficientsVector = new Vector4f[8 * 8 / 4];
            transposedCoefficientsVector = new Vector4f[8 * 8 / 4];

            for (var i = 0; i < 64; i += 4) {
                coefficientsVector[i / 4] = ArrayExtensions.GetVector(coefficients, i);
            }

            for (var i = 0; i < 8; i++) {
                transposedCoefficientsVector[i * 2    ] = new Vector4f(coefficients[i +  0], coefficients[i +  8], coefficients[i + 16], coefficients[i + 24]);
                transposedCoefficientsVector[i * 2 + 1] = new Vector4f(coefficients[i + 32], coefficients[i + 40], coefficients[i + 48], coefficients[i + 56]);
            }
        }
        else {
            coefficientsArray = coefficients;
        }
    }

    /// <summary>
    /// Converts a reconstructed sample to a pixel value: rounds to nearest and
    /// clamps to 0..255. A bare <c>(byte)</c> cast would truncate (199.9999
    /// becomes 199) and is unspecified for values outside the byte range.
    /// </summary>
    /// <param name="value">Reconstructed sample.</param>
    /// <returns>The sample as a byte; NaN maps to 0.</returns>
    private static byte ToPixel(float value) {
        // Written as a negated comparison so that NaN also lands here.
        if (!(value > 0.0f)) {
            return 0;
        }
        if (value >= 255.0f) {
            return 0xff;
        }
        return (byte)(value + 0.5f);
    }

    /// <summary>
    /// SIMD forward DCT of every 8x8 block of <paramref name="image"/> into
    /// <paramref name="dctBuffer"/>. Each block is transposed first so that
    /// one column of pixels can be read as two consecutive vectors.
    /// </summary>
    /// <param name="image">Locked 8bpp source bitmap data.</param>
    /// <param name="dctBuffer">Destination buffer of w * h floats, row stride w.</param>
    /// <param name="w">Width in pixels; a multiple of 8.</param>
    /// <param name="h">Height in pixels; a multiple of 8.</param>
    private static unsafe void ForwardDCTSimd(BitmapData image, IntPtr dctBuffer, int w, int h) {
        var dct_temp = stackalloc float[8 * 8];
        var transposed = stackalloc float[8 * 8];
        var stride = w;
        var scan0_p = (byte*)image.Scan0.ToPointer();
        var scan0_c = (float*)dctBuffer.ToPointer();

        for (var blockY = 0; blockY < h; blockY += 8) {
            var block_p = scan0_p + blockY * image.Stride;
            var block_c = scan0_c + blockY * stride;

            for (var blockX = 0; blockX < w; blockX += 8, block_c += 8, block_p += 8) {
                { // transpose matrix: transposed[x * 8 + y] = pixel(x, y)
                    var pix_t = transposed;
                    for (var tx = 0; tx < 8; tx++) {
                        var pix_p_xy = block_p + tx;
                        for (var ty = 0; ty < 8; ty++) {
                            *(pix_t++) = (float)*pix_p_xy;
                            pix_p_xy += image.Stride;
                        }
                    }
                }
                { // y: dct_temp[v * 8 + x] = sum over y of pixel(x, y) * coefficients[y * 8 + v]
                    var dct_temp_y = dct_temp;
                    for (var v = 0; v < 8; v++) {
                        var vec_p_xy = (Vector4f*)transposed;
                        for (var x = 0; x < 8; x++) {
                            var t = *(vec_p_xy++) * transposedCoefficientsVector[/* (v * 8 / 4) */ v * 2];
                            t += *(vec_p_xy++) * transposedCoefficientsVector[/* (v * 8 / 4) */ v * 2 + 1];
                            *(dct_temp_y++) = t.X + t.Y + t.Z + t.W;
                        }
                    }
                }
                { // x: c(u, v) = sum over x of dct_temp[v * 8 + x] * coefficients[x * 8 + u]
                    for (var v = 0; v < 8; v++) {
                        var pix_c_uv = block_c + v * stride;
                        var dct_temp_x = (Vector4f*)(dct_temp + v * 8);
                        for (var u = 0; u < 8; u++) {
                            var vec_c = dct_temp_x[0] * transposedCoefficientsVector[/* (u * 8 / 4) */ u * 2];
                            vec_c += dct_temp_x[1] * transposedCoefficientsVector[/* (u * 8 / 4) */ u * 2 + 1];
                            pix_c_uv[u] = vec_c.X + vec_c.Y + vec_c.Z + vec_c.W;
                        }
                    }
                }
            }
        }
    }

    /// <summary>
    /// SIMD inverse DCT of every 8x8 coefficient block of
    /// <paramref name="dctBuffer"/> into <paramref name="image"/>.
    /// </summary>
    /// <param name="dctBuffer">Source buffer of w * h floats, row stride w.</param>
    /// <param name="image">Locked 8bpp destination bitmap data.</param>
    /// <param name="w">Width in pixels; a multiple of 8.</param>
    /// <param name="h">Height in pixels; a multiple of 8.</param>
    private static unsafe void InverseDCTSimd(IntPtr dctBuffer, BitmapData image, int w, int h) {
        var dct_temp = stackalloc float[8 * 8];
        var transposed = stackalloc float[8 * 8];
        var stride = w;
        var scan0_p = (byte*)image.Scan0.ToPointer();
        var scan0_c = (float*)dctBuffer.ToPointer();

        for (var blockY = 0; blockY < h; blockY += 8) {
            var block_p = scan0_p + blockY * image.Stride;
            var block_c = scan0_c + blockY * stride;

            for (var blockX = 0; blockX < w; blockX += 8, block_c += 8, block_p += 8) {
                { // transpose matrix: transposed[u * 8 + v] = c(u, v)
                    var pix_t = transposed;
                    for (var tu = 0; tu < 8; tu++) {
                        var pix_c_uv = block_c + tu;
                        for (var tv = 0; tv < 8; tv++) {
                            *(pix_t++) = *pix_c_uv;
                            pix_c_uv += stride;
                        }
                    }
                }
                { // y: dct_temp[y * 8 + u] = sum over v of c(u, v) * coefficients[y * 8 + v]
                    var dct_temp_y = dct_temp;
                    for (var y = 0; y < 8; y++) {
                        var vec_c_uv = (Vector4f*)transposed;
                        for (var u = 0; u < 8; u++) {
                            var t = *(vec_c_uv++) * coefficientsVector[/* (y * 8 / 4) */ y * 2];
                            t += *(vec_c_uv++) * coefficientsVector[/* (y * 8 / 4) */ y * 2 + 1];
                            *(dct_temp_y++) = t.X + t.Y + t.Z + t.W;
                        }
                    }
                }
                { // x: pixel(x, y) = sum over u of dct_temp[y * 8 + u] * coefficients[x * 8 + u]
                    for (var y = 0; y < 8; y++) {
                        var pix_p_xy = block_p + y * image.Stride;
                        var dct_temp_x = (Vector4f*)(dct_temp + y * 8);
                        for (var x = 0; x < 8; x++) {
                            var vec_p = dct_temp_x[0] * coefficientsVector[/* (x * 8 / 4) */ x * 2];
                            vec_p += dct_temp_x[1] * coefficientsVector[/* (x * 8 / 4) */ x * 2 + 1];
                            pix_p_xy[x] = ToPixel(vec_p.X + vec_p.Y + vec_p.Z + vec_p.W);
                        }
                    }
                }
            }
        }
    }

    /// <summary>
    /// Scalar forward DCT of every 8x8 block of <paramref name="image"/> into
    /// <paramref name="dctBuffer"/>.
    /// </summary>
    /// <param name="image">Locked 8bpp source bitmap data.</param>
    /// <param name="dctBuffer">Destination buffer of w * h floats, row stride w.</param>
    /// <param name="w">Width in pixels; a multiple of 8.</param>
    /// <param name="h">Height in pixels; a multiple of 8.</param>
    private static unsafe void ForwardDCTSisd(BitmapData image, IntPtr dctBuffer, int w, int h) {
#if !TEST
        var dct_temp = stackalloc float[8 * 8];
#endif
        var stride = w;
        var scan0_p = (byte*)image.Scan0.ToPointer();
        var scan0_c = (float*)dctBuffer.ToPointer();

        for (var blockY = 0; blockY < h; blockY += 8) {
            var block_p = scan0_p + blockY * image.Stride;
            var block_c = scan0_c + blockY * stride;

            for (var blockX = 0; blockX < w; blockX += 8, block_p += 8, block_c += 8) {
#if !TEST
                { // y
                    var dct_temp_y = dct_temp;
                    for (var v = 0; v < 8; v++) {
                        for (var x = 0; x < 8; x++) {
                            var pix_p_xy = block_p + x;
                            var t = 0.0f;
                            for (var y = 0; y < 8; y++) {
                                t += *pix_p_xy * coefficientsArray[y * 8 + v];
                                pix_p_xy += image.Stride;
                            }
                            *(dct_temp_y++) = t;
                        }
                    }
                }
                { // x
                    for (var v = 0; v < 8; v++) {
                        var pix_c_uv = block_c + v * stride;
                        for (var u = 0; u < 8; u++) {
                            var dct_temp_x = dct_temp + v * 8;
                            var pix_c = 0.0f;
                            for (var x = 0; x < 8; x++) {
                                pix_c += dct_temp_x[x] * coefficientsArray[x * 8 + u];
                            }
                            pix_c_uv[u] = pix_c;
                        }
                    }
                }
#else
                // Direct formulation: 64 multiply-adds per output coefficient.
                for (var v = 0; v < 8; v++) {
                    var pix_c_uv = block_c + v * stride;
                    for (var u = 0; u < 8; u++) {
                        var pix_c = 0.0f;
                        for (var y = 0; y < 8; y++) {
                            var pix_p_xy = block_p + y * image.Stride;
                            for (var x = 0; x < 8; x++) {
                                pix_c += pix_p_xy[x] * coefficientsArray[y * 8 + v] * coefficientsArray[x * 8 + u];
                            }
                        }
                        pix_c_uv[u] = pix_c;
                    }
                }
#endif
            }
        }
    }

    /// <summary>
    /// Scalar inverse DCT of every 8x8 coefficient block of
    /// <paramref name="dctBuffer"/> into <paramref name="image"/>.
    /// </summary>
    /// <param name="dctBuffer">Source buffer of w * h floats, row stride w.</param>
    /// <param name="image">Locked 8bpp destination bitmap data.</param>
    /// <param name="w">Width in pixels; a multiple of 8.</param>
    /// <param name="h">Height in pixels; a multiple of 8.</param>
    private static unsafe void InverseDCTSisd(IntPtr dctBuffer, BitmapData image, int w, int h) {
#if !TEST
        var dct_temp = stackalloc float[8 * 8];
#endif
        var stride = w;
        var scan0_p = (byte*)image.Scan0.ToPointer();
        var scan0_c = (float*)dctBuffer.ToPointer();

        for (var blockY = 0; blockY < h; blockY += 8) {
            var block_p = scan0_p + blockY * image.Stride;
            var block_c = scan0_c + blockY * stride;

            for (var blockX = 0; blockX < w; blockX += 8, block_c += 8, block_p += 8) {
#if !TEST
                { // y
                    var dct_temp_y = dct_temp;
                    for (var y = 0; y < 8; y++) {
                        for (var u = 0; u < 8; u++) {
                            var pix_c_uv = block_c + u;
                            var t = 0.0f;
                            for (var v = 0; v < 8; v++) {
                                t += *pix_c_uv * coefficientsArray[y * 8 + v];
                                pix_c_uv += stride;
                            }
                            *(dct_temp_y++) = t;
                        }
                    }
                }
                { // x
                    for (var y = 0; y < 8; y++) {
                        var pix_p_xy = block_p + y * image.Stride;
                        for (var x = 0; x < 8; x++) {
                            var dct_temp_x = dct_temp + y * 8;
                            var pix_p = 0.0f;
                            for (var u = 0; u < 8; u++) {
                                pix_p += dct_temp_x[u] * coefficientsArray[x * 8 + u];
                            }
                            pix_p_xy[x] = ToPixel(pix_p);
                        }
                    }
                }
#else
                // Direct formulation: 64 multiply-adds per output pixel.
                for (var y = 0; y < 8; y++) {
                    var pix_p_xy = block_p + y * image.Stride;
                    for (var x = 0; x < 8; x++) {
                        var pix_p = 0.0f;
                        for (var v = 0; v < 8; v++) {
                            var pix_c_uv = block_c + v * stride;
                            for (var u = 0; u < 8; u++) {
                                pix_p += pix_c_uv[u] * coefficientsArray[y * 8 + v] * coefficientsArray[x * 8 + u];
                            }
                        }
                        pix_p_xy[x] = ToPixel(pix_p);
                    }
                }
#endif
            }
        }
    }

    /// <summary>
    /// Runs the forward DCT of <paramref name="luminance"/> into
    /// <paramref name="dctBuffer"/> and prints the elapsed time.
    /// </summary>
    /// <param name="simd">True to use the SIMD kernel.</param>
    /// <param name="luminance">8bpp source image.</param>
    /// <param name="dctBuffer">Destination buffer of Width * Height floats.</param>
    private static void ForwardDCT(bool simd, Bitmap luminance, IntPtr dctBuffer) {
        DCT(simd, true, luminance.Width, luminance.Height, luminance, dctBuffer);
    }

    /// <summary>
    /// Runs the inverse DCT of <paramref name="dctBuffer"/> into a newly
    /// created 8bpp bitmap and prints the elapsed time.
    /// </summary>
    /// <param name="simd">True to use the SIMD kernel.</param>
    /// <param name="width">Width in pixels.</param>
    /// <param name="height">Height in pixels.</param>
    /// <param name="dctBuffer">Source buffer of width * height floats.</param>
    /// <returns>The restored image, owned by the caller.</returns>
    private static Bitmap InverseDCT(bool simd, int width, int height, IntPtr dctBuffer) {
        var image = Create8bppGrayscaleBitmap(width, height);

        try {
            DCT(simd, false, width, height, image, dctBuffer);
        }
        catch {
            // The caller never receives the bitmap on failure.
            image.Dispose();
            throw;
        }

        return image;
    }

    /// <summary>
    /// Locks <paramref name="image"/>, dispatches to the selected kernel under
    /// a stopwatch and prints "forward DCT spent ..." or "inverse DCT spent ...".
    /// </summary>
    /// <param name="simd">True to use the SIMD kernel.</param>
    /// <param name="forward">True for image to coefficients, false for the reverse.</param>
    /// <param name="width">Width in pixels.</param>
    /// <param name="height">Height in pixels.</param>
    /// <param name="image">Source (forward) or destination (inverse) bitmap.</param>
    /// <param name="dctBuffer">Coefficient buffer of width * height floats.</param>
    private static void DCT(bool simd, bool forward, int width, int height, Bitmap image, IntPtr dctBuffer) {
        var rect = new Rectangle(0, 0, width, height);
        BitmapData locked = null;

        try {
            locked = image.LockBits(rect, forward ? ImageLockMode.ReadOnly : ImageLockMode.WriteOnly, Format8bppGrayscale);

            var stopwatch = new Stopwatch();

            stopwatch.Start();

            if (forward) {
                if (simd) {
                    ForwardDCTSimd(locked, dctBuffer, rect.Width, rect.Height);
                }
                else {
                    ForwardDCTSisd(locked, dctBuffer, rect.Width, rect.Height);
                }
            }
            else {
                if (simd) {
                    InverseDCTSimd(dctBuffer, locked, rect.Width, rect.Height);
                }
                else {
                    InverseDCTSisd(dctBuffer, locked, rect.Width, rect.Height);
                }
            }

            stopwatch.Stop();

            Console.WriteLine("{0} DCT spent {1}", forward ? "forward" : "inverse", stopwatch.Elapsed);
        }
        finally {
            if (locked != null) {
                image.UnlockBits(locked);
            }
        }
    }

    /// <summary>
    /// Creates an 8bpp luminance image from a color image using
    /// Y = 0.299R + 0.587G + 0.114B.
    /// </summary>
    /// <param name="imageColored">Source image; read as 24bpp RGB.</param>
    /// <returns>A new 8bpp grayscale bitmap owned by the caller.</returns>
    private static Bitmap CreateLuminanceImage(Bitmap imageColored) {
        var imageLuminance = Create8bppGrayscaleBitmap(imageColored.Width, imageColored.Height);
        var succeeded = false;

        BitmapData lockedColored = null;
        BitmapData lockedLuminance = null;

        try {
            var rect = new Rectangle(0, 0, imageColored.Width, imageColored.Height);

            lockedColored   = imageColored  .LockBits(rect, ImageLockMode.ReadOnly,  PixelFormat.Format24bppRgb);
            lockedLuminance = imageLuminance.LockBits(rect, ImageLockMode.WriteOnly, Format8bppGrayscale);

            unsafe {
                for (var y = 0; y < imageColored.Height; y++) {
                    var bgr = (byte*)   lockedColored.Scan0.ToPointer() + y *   lockedColored.Stride;
                    var lum = (byte*)lockedLuminance.Scan0.ToPointer() + y * lockedLuminance.Stride;

                    for (var x = 0; x < imageColored.Width; x++) {
                        // Format24bppRgb stores each pixel as B, G, R.
                        int b = *(bgr++);
                        int g = *(bgr++);
                        int r = *(bgr++);

                        // 0.299R + 0.587G + 0.114B in fixed point. The
                        // weighted sum is divided once, with rounding, so
                        // that white (255, 255, 255) yields 255; dividing
                        // each term separately truncates three times and
                        // gives 254.
                        *(lum++) = (byte)((r * 299 + g * 587 + b * 114 + 500) / 1000);
                    }
                }
            }

            succeeded = true;

            return imageLuminance;
        }
        finally {
            if (lockedColored != null) {
                imageColored.UnlockBits(lockedColored);
            }
            if (lockedLuminance != null) {
                imageLuminance.UnlockBits(lockedLuminance);
            }
            // The caller never receives the bitmap on failure.
            if (!succeeded) {
                imageLuminance.Dispose();
            }
        }
    }

    /// <summary>
    /// Creates an 8bpp indexed bitmap whose palette entry i is the opaque
    /// gray (i, i, i), so that it behaves as an 8bpp grayscale image.
    /// </summary>
    /// <param name="width">Width in pixels.</param>
    /// <param name="height">Height in pixels.</param>
    /// <returns>A new bitmap owned by the caller.</returns>
    private static Bitmap Create8bppGrayscaleBitmap(int width, int height) {
        var grayscaled = new Bitmap(width, height, Format8bppGrayscale); // 8bpp indexed as 8bpp grayscale
        // Palette returns a copy; it has to be assigned back to take effect.
        var palette = grayscaled.Palette;

        for (var i = 0; i < palette.Entries.Length; i++) {
            palette.Entries[i] = Color.FromArgb(0xff, i, i, i);
        }

        grayscaled.Palette = palette;

        return grayscaled;
    }

    /// <summary>
    /// Entry point. Arguments: mode ("simd" selects the SIMD kernels, anything
    /// else the scalar ones), input image path, output directory. Runs forward
    /// and inverse DCT five times and saves luminance.bmp and inverted.bmp
    /// from the first run.
    /// </summary>
    /// <param name="args">Command line arguments as described above.</param>
    public static void Main(string[] args) {
        if (args == null || args.Length < 3) {
            Console.Error.WriteLine("usage: dct.exe (simd|sisd) image output-directory");
            Environment.ExitCode = 1;
            return;
        }

        var useSimd = (args[0] == "simd");

        if (useSimd) {
            Console.WriteLine("Mono.Simd accel mode: {0}", SimdRuntime.AccelMode);

            foreach (var method in new[] {
                new {Type = typeof(Vector4f), Method = "op_Addition", Signature = new[] {typeof(Vector4f), typeof(Vector4f)}},
                new {Type = typeof(Vector4f), Method = "op_Multiply", Signature = new[] {typeof(Vector4f), typeof(Vector4f)}},
            }) {
                Console.WriteLine("acceleration: {0}.{1} = {2}", method.Type.Name, method.Method, SimdRuntime.IsMethodAccelerated(method.Type, method.Method, method.Signature));
            }
        }

        using (var image = new Bitmap(args[1])) {
            Console.WriteLine("image size {0}x{1}", image.Width, image.Height);

            var w = image.Width;
            var h = image.Height;

            if ((w & 7) != 0 || (h & 7) != 0) {
                throw new ApplicationException("both width and height must be n * 8");
            }

            InitializeCoefficients(useSimd);

            using (var luminance = CreateLuminanceImage(image)) {
                // checked: an overflowing size must fail loudly instead of
                // allocating a buffer that is too small.
                var dctBuffer = Marshal.AllocCoTaskMem(checked(sizeof(float) * w * h));

                try {
                    for (var act = 0; act < 5; act++) {
                        ForwardDCT(useSimd, luminance, dctBuffer);

                        using (var inverted = InverseDCT(useSimd, w, h, dctBuffer)) {
                            if (act == 0) {
                                luminance.Save(Path.Combine(args[2], "luminance.bmp"), ImageFormat.Bmp);
                                inverted.Save(Path.Combine(args[2], "inverted.bmp"), ImageFormat.Bmp);
                            }
                        }
                    }
                }
                finally {
                    // Unmanaged memory is not collected; release it even when
                    // a DCT pass or Save throws.
                    Marshal.FreeCoTaskMem(dctBuffer);
                }
            }
        }
    }
}