Project import generated by Copybara. NOKEYCHECK=True GitOrigin-RevId: dcbe0211d22b840a0543aea2f5415be7d645a689
diff --git a/BMP.c b/BMP.c new file mode 100644 index 0000000..cadb718 --- /dev/null +++ b/BMP.c
@@ -0,0 +1,1413 @@
+
+/*=============================================================================
+ bmplib, a simple library to create, modify, and write BMP image files.
+ Copyright (C) 2009-2010 by Zack T Smith.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2
+ as published by the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+ The author may be reached at fbui@comcast.net.
+ *============================================================================*/
+
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "BMP.h"
+
+// Mini characters, 8 pixels high.
+static char* mini_chars [] = +{ + "#", + "#", + "#", + "#", + "#", + " ", + "#", + "", + + "## ##", + " # #", + "# #", + " ", + " ", + " ", + " ", + "", + + " # # ", + " # # ", + "#####", + " # # ", + "#####", + " # # ", + " # # ", + "", + + " # ", + " ####", + "# # ", + " ### ", + " # #", + "####", + " # ", + "", + + "## #", + " #", + " #", + " #", + " #", + "#", + "# ##", + "", + + " # ", + "# # ", + "## ", + " ## #", + "# ## ", + "# # ", + " ## #", + "", + + "##", + " #", + "#", + "", + "", + "", + "", + "", + + " #", + "#", + "#", + "#", + "#", + "#", + "#", + " #", + + "# ", + " #", + " #", + " #", + " #", + " #", + " #", + "#", + + " ", + "# # #", + " ###", + " #", + " ###", + "# # #", + "", + "", + + " ", + " #", + " #", + "#####", + " #", + " #", + "", + "", + + " ", + "", + "", + "", + "", + "##", + " #", + "#", + + " ", + "", + "", + "#####", + "", + "", + "", + "", + + " ", + "", + "", + "", + "", + "", + "#", + "", + + " #", + " #", + " #", + " #", + " #", + "#", + "#", + "", + + " ## ", + "# #", + "# #", + "# #", + "# #", + "# #", + " ## ", + "", + + " #", + "##", + " #", + " #", + " #", + " #", + " #", + "", + + " ## ", + "# #", + " #", + " ###", + "# ", + "# ", + "####", + "", + + "####", + " #", + " # ", + " ## ", + " #", + "# #", + " ## ", + "", + + "# # ", + "# #", + "# #", + "####", + " #", + " #", + " #", + "", + + "####", + "# ", + "### ", + " #", + " #", + "# #", + " ## ", + "", + + " ## ", + "# ", + "# ", + "### ", + "# #", + "# #", + " ## ", + "", + + "####", + " #", + " #", + " # ", + " # ", + " # ", + " # ", + "", + + " ## ", + "# #", + "# #", + " ## ", + "# #", + "# #", + " ## ", + "", + + " ## ", + "# #", + "# #", + " ###", + " #", + " # ", + " # ", + "", + + " ", + "", + "", + "#", + "", + "#", + "", + "", + + " ", + "", + " ", + "##", + " ", + "##", + " #", + "#", + + " #", + " #", + " #", + "#", + " #", + " #", + " #", + "", + + " ", + "", + "", + "#####", + " ", + "#####", + "", + "", + + "# ", + " #", + " #", + " #", + " #", + " 
#", + "#", + "", + + " ### ", + "# #", + " #", + " ## ", + " #", + "", + " #", + "", + + " ### ", + "# #", + "# .##", + "# # #", + "# .##", + "# ", + " ###", + "", + + " # ", + " # # ", + "# #", + "# #", + "#####", + "# #", + "# #", + "", + + "#### ", + "# #", + "# #", + "#### ", + "# #", + "# #", + "####", + "", + + " ### ", + "# #", + "# ", + "# ", + "# ", + "# #", + " ###", + "", + + "#### ", + "# #", + "# #", + "# #", + "# #", + "# #", + "####", + "", + + "#####", + "#", + "#", + "###", + "#", + "#", + "#####", + "", + + "#####", + "# ", + "# ", + "###", + "# ", + "# ", + "#", + "", + + " ### ", + "# #", + "# ", + "# ##", + "# #", + "# #", + " ###.", + "", + + "# #", + "# #", + "# #", + "#####", + "# #", + "# #", + "# #", + "", + + "###", + " #", + " #", + " #", + " #", + " #", + "###", + "", + + " ###", + " #", + " #", + " #", + " #", + "# #", + " ##", + "", + + "# #", + "# #", + "# #", + "##", + "# #", + "# #", + "# #", + "", + + "# ", + "#", + "#", + "#", + "#", + "#", + "#####", + "", + + "# #", + "## ##", + "# # #", + "# #", + "# #", + "# #", + "# #", + "", + + "# #", + "## #", + "# # #", + "# ##", + "# #", + "# #", + "# #", + "", + + " ### ", + "# #", + "# #", + "# #", + "# #", + "# #", + " ###", + "", + + "#### ", + "# #", + "# #", + "#### ", + "# ", + "# ", + "# ", + "", + + " ### ", + "# #", + "# #", + "# #", + "# # #", + "# # ", + " ## #", + "", + + "#### ", + "# #", + "# #", + "#### ", + "# # ", + "# # ", + "# #", + "", + + " ### ", + "# #", + "# ", + " ### ", + " #", + "# #", + " ###", + "", + + "#####", + " #", + " #", + " #", + " #", + " #", + " #", + "", + + "# #", + "# #", + "# #", + "# #", + "# #", + "# #", + " ###", + "", + + "# #", + "# #", + "# #", + "# #", + "# #", + " # # ", + " #", + "", + + "# #", + "# #", + "# #", + "# . 
#", + "# # #", + "## ##", + "# #", + "", + + "# #", + "# #", + " # #", + " #", + " # #", + "# #", + "# #", + "", + + "# #", + "# #", + "# #", + " # #", + " #", + " #", + " #", + "", + + "#####", + " #", + " #", + " #", + " #", + "#", + "#####", + "", + + "##", + "#", + "#", + "#", + "#", + "#", + "#", + "##", + + "# ", + "#", + " #", + " #", + " #", + " #", + " #", + "", + + "##", + " #", + " #", + " #", + " #", + " #", + " #", + "##", + + " # ", + " #.#", + "# #", + "", + "", + "", + "", + "", + + " ", + "", + "", + "", + "", + "", + "", + "####", + + "##", + "#", + " #", + "", + "", + "", + "", + "", + + " ", + " ", + " ## ", + " #", + " ###", + "# #", + ".###", + "", + + "# ", + "# ", + "### ", + "# #", + "# #", + "# #", + "### ", + "", + + " ", + " ", + " ###", + "# ", + "# ", + "# ", + " ###", + "", + + " #", + " #", + " ###", + "# #", + "# #", + "# #", + " ###", + "", + + " ", + " ", + " ## ", + "# #", + "####", + "# ", + " ###", + "", + + " ##", + " # ", + "### ", + " # ", + " # ", + " # ", + "### ", + "", + + " ", + " ", + " ###", + "# #", + "# #", + " ###", + " #", + "### ", + + "# ", + "# ", + "### ", + "# #", + "# #", + "# #", + "# #", + "", + + " # ", + " ", + "## ", + " # ", + " # ", + " # ", + "###", + "", + + " #", + " ", + " ##", + " #", + " #", + " #", + " #", + "## ", + + "# ", + "# ", + "# #", + "# # ", + "## ", + "# # ", + "# #", + "", + + "## ", + " # ", + " # ", + " # ", + " # ", + " # ", + "###", + "", + + " ", + "", + "####", + "# # #", + "# # #", + "# # #", + "# # #", + "", + + " ", + " ", + "###", + "# #", + "# #", + "# #", + "# #", + "", + + " ", + " ", + " ## ", + "# #", + "# #", + "# #", + " ## ", + "", + + " ", + "", + "###", + "# #", + "# #", + "###", + "#", + "#", + + " ", + "", + " ###", + "# #", + "# #", + " ###", + " #", + " # ", + + " ", + " ", + "# ##", + "## ", + "# ", + "# ", + "# ", + "", + + " ", + " ", + " ###", + "# ", + " ##", + " #", + "### ", + "", + + " # ", + " #", + "###", + " #", + " #", + " #", + " ##", + "", + + " 
", + "", + "# #", + "# #", + "# #", + "# #", + " ###", + "", + + " ", + "", + "# #", + "# #", + "# #", + " # #", + " #", + "", + + " ", + "", + "# # #", + "# # #", + "# # #", + "# # #", + " # #", + "", + + " ", + "", + "# #", + " # #", + " #", + " # #", + "# #", + "", + + " ", + " ", + "# #", + "# #", + "# #", + " ###", + " #", + "### ", + + " ", + "", + "#####", + " #", + " #", + " # ", + "#####", + "", + + +}; + + +// Narrowest possible numbers. +static char* narrow_nums [] = +{ + " # ", + "# #", + "# #", + "# #", + "# #", + "# #", + " # ", + + " #", + "##", + " #", + " #", + " #", + " #", + " #", + + " # ", + "# #", + " #", + " ##", + "# ", + "# ", + "###", + + "###", + " #", + " # ", + "## ", + " #", + "# #", + " # ", + + "# #", + "# #", + "# #", + "###", + " #", + " #", + " #", + + "###", + "# ", + "## ", + " #", + " #", + "# #", + " # ", + + + " # ", + "# ", + "# ", + "## ", + "# #", + "# #", + " # ", + + "###", + " #", + " #", + " # ", + " # ", + " # ", + " # ", + + " # ", + "# #", + "# #", + " # ", + "# #", + "# #", + " # ", + + " # ", + "# #", + "# #", + " ##", + " #", + " # ", + "# ", + + " ", + "", + "", + " ", + "", + "", + "#", +};
+
+
+/*===========================================================================
+ * Name: BMP_new
+ * Purpose: Creates new image with every pixel set to zero.
+ * Params: w, h - requested size in pixels; both must be at least 1.
+ * Returns: The new image, or NULL if a dimension is out of range, the
+ *          pixel buffer size does not fit in size_t, or memory cannot
+ *          be allocated.
+ * Note: Each dimension is rounded up to the next multiple of 4, so the
+ *       stored width/height may be larger than what was requested.
+ */
+BMP*
+BMP_new (int w, int h)
+{
+	BMP* nu;
+	size_t npixels;
+
+	// The upper bound keeps the round-up below from overflowing int.
+	if (w < 1 || h < 1 || w > INT_MAX - 3 || h > INT_MAX - 3)
+		return NULL;
+	//----------
+
+	if (w & 3)
+		w += 4 - (w & 3);
+	if (h & 3)
+		h += 4 - (h & 3);
+
+	// Count pixels in size_t and refuse totals whose byte size would wrap.
+	// The multiplication used to be done in int, where it could overflow
+	// and lead to a buffer smaller than width*height pixels.
+	if ((size_t) w > (size_t) -1 / sizeof (unsigned long) / (size_t) h)
+		return NULL;
+	npixels = (size_t) w * (size_t) h;
+
+	nu = calloc (1, sizeof *nu);
+	if (!nu)
+		return NULL;
+	nu->width = w;
+	nu->height = h;
+	nu->pixels = calloc (npixels, sizeof *nu->pixels);
+	if (!nu->pixels) {
+		free (nu);
+		return NULL;
+	}
+	return nu;
+}
+
+/*===========================================================================
+ * Name: BMP_delete
+ * Purpose: Deallocates image.
+ */ +void +BMP_delete (BMP* bmp) +{ + if (!bmp) + return; + //---------- + + if (bmp->pixels) + free (bmp->pixels); + free (bmp); +} + +/*=========================================================================== + * Name: BMP_point + * Purpose: Writes pixel into image. + */ +void +BMP_point (BMP *bmp, int x, int y, unsigned long rgb) +{ + if (!bmp || x<0 || y<0) + return; + if (x >= bmp->width || y >= bmp->height) + return; + if (!bmp->pixels) + return; + //---------- + + bmp->pixels[y*bmp->width + x] = rgb; +} + +/*=========================================================================== + * Name: BMP_line + * Purpose: Draws a line in a BMP image. + */ +void +BMP_line (BMP *bmp, int x0, int y0, int x1, int y1, unsigned long rgb) +{ + if ((rgb >> 24) == 0xff) + return; + + if (x0 == x1 && y0 == y1) + BMP_point (bmp, x0, y0, rgb); + else if (x0 == x1) + BMP_vline (bmp, x0, y0, y1, rgb); + else if (y0 == y1) + BMP_hline (bmp, x0, x1, y0, rgb); + else { + int j, x, y, dx, dy, e, xchange, s1, s2; + + // DDA, copied from my FramebufferUI project. + + x = x0; + y = y0; + s1 = 1; + s2 = 1; + + dx = x1 - x0; + if (dx < 0) { + dx = -dx; + s1 = -1; + } + + dy = y1 - y0; + if (dy < 0) { + dy = -dy; + s2 = -1; + } + + xchange = 0; + + if (dy > dx) { + int tmp = dx; + dx = dy; + dy = tmp; + xchange = 1; + } + + e = (dy<<1) - dx; + j = 0; + + while (j <= dx) { + j++; + + BMP_point (bmp, x, y, rgb); + + if (e >= 0) { + if (xchange) + x += s1; + else + y += s2; + e -= (dx << 1); + } + if (xchange) + y += s2; + else + x += s1; + e += (dy << 1); + } + } +} + +/*=========================================================================== + * Name: BMP_rect + * Purpose: Fills a rectangle with a color. 
+ */ +void +BMP_rect (BMP *bmp, int x, int y, int w, int h, unsigned long rgb) +{ + BMP_hline (bmp, x, x+w-1, y, rgb); + BMP_hline (bmp, x, x+w-1, y+h-1, rgb); + BMP_vline (bmp, x, y, y+h-1, rgb); + BMP_vline (bmp, x+w-1, y, y+h-1, rgb); +} + +/*=========================================================================== + * Name: BMP_fillrect + * Purpose: Fills a rectangle with a color. + */ +void +BMP_fillrect (BMP *bmp, int x, int y, int w, int h, unsigned long rgb) +{ + while (h > 0) { + BMP_hline (bmp, x, x+w-1, y, rgb); + h--; + y++; + } +} + +/*=========================================================================== + * Name: BMP_clear + * Purpose: Sets all pixels to specified color. + */ +void +BMP_clear (BMP *bmp, unsigned long rgb) +{ + BMP_fillrect (bmp, 0, 0, bmp->width, bmp->height, rgb); +} + +/*=========================================================================== + * Name: BMP_hline + * Purpose: Draws horizontal line. + */ +void +BMP_hline (BMP *bmp, int x0, int x1, int y, unsigned long rgb) +{ + if (x0 > x1) { + int tmp=x1; + x1=x0; + x0=tmp; + } + + while (x0 <= x1) { + BMP_point (bmp, x0++, y, rgb); + } +} + +/*=========================================================================== + * Name: BMP_vline + * Purpose: Draws vertical line. + */ +void +BMP_vline (BMP *bmp, int x, int y0, int y1, unsigned long rgb) +{ + if (y0 > y1) { + int tmp=y1; + y1=y0; + y0=tmp; + } + + while (y0 <= y1) { + BMP_point (bmp, x, y0++, rgb); + } +} + +/*=========================================================================== + * Name: BMP_draw_mini_string + * Purpose: Draws miniature 5x8 characters into the image. + * Note: Full ASCII character set not supported. 
+ */ +int +BMP_draw_mini_string (BMP *bmp, char *string, int x, int y, unsigned long color) +{ + char ch, *s; + unsigned long r,g,b; + unsigned long light; + + if (!bmp || !string) + return 0; + if (x >= bmp->width || y >= bmp->height || !*string) + return 0; + //---------- + + r = 0xff & (color >> 16); + g = 0xff & (color >> 8); + b = 0xff & color; + r += 3*0xff; + b += 3*0xff; + g += 3*0xff; + r /= 4; + g /= 4; + b /= 4; + light = b | (g << 8) | (r << 16); + +#define MINI_HEIGHT (8) + s = string; + while ((ch = *s++)) { + int ix = -1; + if (ch == ' ') { + x += 5; + continue; + } + if (ch > 'z') + continue; + if (ch > ' ' && ch <= 'z') + ix = MINI_HEIGHT * (ch - 33); + + if (ix >= 0) { + int i; + int width = strlen (mini_chars[ix]); + + for (i=0; i<MINI_HEIGHT; i++) { + int j=0; + char ch2, *s2 = mini_chars[ix + i]; + while ((ch2 = *s2++)) { + switch (ch2) { + case '#': + BMP_point (bmp,x+j, y+i, color); + break; + case '.': + BMP_point (bmp,x+j, y+i, light); + break; + } + j++; + } + } + + x += width + 1; + } + } + + return x; +} + +/*=========================================================================== + * Name: BMP_mini_string_width + * Purpose: Gets width of miniature 5x8 characters. + * Note: Full ASCII character set not supported. + */ +int +BMP_mini_string_width (char *string) +{ + char ch, *s; + int width = 0; + + if (!string) + return 0; + //---------- + + s = string; + while ((ch = *s++)) { + int ix = -1; + if (ch == ' ') { + width += 5; + continue; + } + if (ch > 'z') + continue; + if (ch > ' ' && ch <= 'z') + ix = MINI_HEIGHT * (ch - 33); + + if (ix >= 0) { + int w = strlen (mini_chars[ix]); + + width += w + 1; + } + } + + return width; +} + +/*=========================================================================== + * Name: BMP_narrow_numbers + * Purpose: Draws miniature 4x7 characters into the image. + * Note: Full ASCII character set not supported. 
+ */ +int +BMP_draw_narrow_numbers (BMP *bmp, char *string, int x, int y, unsigned long color) +{ + char ch, *s; + + if (!bmp || !string) + return 0; + if (x >= bmp->width || y >= bmp->height || !*string) + return 0; + //---------- + +#define NARROW_HEIGHT (7) + s = string; + while ((ch = *s++)) { + int ix = -1; + if (ch == ' ') { + x += 3; + continue; + } + if (ch >= '0' && ch <= '9') + ix = ch - '0'; + else + if (ch == '.') + ix = 10; + + ix *= NARROW_HEIGHT; + + if (ix >= 0) { + int i; + int width = strlen (narrow_nums [ix]); + + for (i=0; i<NARROW_HEIGHT; i++) { + int j=0; + char ch2, *s2 = narrow_nums [ix + i]; + while ((ch2 = *s2++)) { + if (ch2 == '#') { + BMP_point (bmp, + x+j, y+i, color); + } + j++; + } + } + + x += width + 1; + } + } + + return x; +} + +/*=========================================================================== + * Name: BMP_getpixel + * Purpose: Reads pixel out of image. + */ +unsigned long +BMP_getpixel (BMP *bmp, int x, int y) +{ + if (!bmp || x<0 || y<0) + return 0; + if (x >= bmp->width || y >= bmp->height) + return 0; + if (!bmp->pixels) + return 0; + //---------- + + return bmp->pixels[y*bmp->width + x]; +} + +/*=========================================================================== + * Name: BMP_write + * Purpose: Writes image to BMP file. + */ +int +BMP_write (BMP* bmp, char *path) +{ + FILE *f; +#define HDRLEN (54) + unsigned char h[HDRLEN]; + unsigned long len; + int i, j; + + if (!bmp || !path) + return -1; + //---------- + + memset (h, 0, HDRLEN); + + //---------------------------------------- + // Create the file. 
+ // + f = fopen (path, "wb"); + if (!f) + return 0; + + //---------------------------------------- + // Prepare header + // + len = HDRLEN + 3 * bmp->width * bmp->height; + h[0] = 'B'; + h[1] = 'M'; + h[2] = len & 0xff; + h[3] = (len >> 8) & 0xff; + h[4] = (len >> 16) & 0xff; + h[5] = (len >> 24) & 0xff; + h[10] = HDRLEN; + h[14] = 40; + h[18] = bmp->width & 0xff; + h[19] = (bmp->width >> 8) & 0xff; + h[20] = (bmp->width >> 16) & 0xff; + h[22] = bmp->height & 0xff; + h[23] = (bmp->height >> 8) & 0xff; + h[24] = (bmp->height >> 16) & 0xff; + h[26] = 1; + h[28] = 24; + h[34] = 16; + h[36] = 0x13; // 2835 pixels/meter + h[37] = 0x0b; + h[42] = 0x13; // 2835 pixels/meter + h[43] = 0x0b; + + //---------------------------------------- + // Write header. + // + if (HDRLEN != fwrite (h, 1, HDRLEN, f)) { + fclose (f); + return 0; + } + + //---------------------------------------- + // Write pixels. + // Note that BMP has lower rows first. + // + for (j=bmp->height-1; j >= 0; j--) { + for (i=0; i < bmp->width; i++) { + unsigned char rgb[3]; + int ix = i + j * bmp->width; + unsigned long pixel = bmp->pixels[ix]; + rgb[0] = pixel & 0xff; + rgb[1] = (pixel >> 8) & 0xff; + rgb[2] = (pixel >> 16) & 0xff; + if (3 != fwrite (rgb, 1, 3, f)) { + fclose (f); + return 0; + } + } + } + + fclose (f); + return 1; +} + +
diff --git a/BMP.h b/BMP.h new file mode 100644 index 0000000..a86858a --- /dev/null +++ b/BMP.h
@@ -0,0 +1,66 @@ + +/*============================================================================= + bmplib, a simple library to create, modify, and write BMP image files. + Copyright (C) 2009 by Zack T Smith. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 + as published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + The author may be reached at fbui@comcast.net. + *============================================================================*/ + +#ifndef _BMP_H +#define _BMP_H + +typedef struct { + int width, height; + unsigned long *pixels; +} BMP; + +#define MINIFONT_HEIGHT (8) + +extern BMP* BMP_new (int, int); +extern void BMP_delete (BMP*); +extern void BMP_clear (BMP*, unsigned long); +extern int BMP_write (BMP*, char *path); +extern void BMP_point (BMP*, int, int, unsigned long); +extern void BMP_line (BMP *, int x0, int y0, int x1, int y1, unsigned long); +extern void BMP_hline (BMP *, int x0, int x1, int y, unsigned long); +extern void BMP_vline (BMP *, int x, int y0, int y1, unsigned long); +extern void BMP_rect (BMP *, int x, int y, int w, int h, unsigned long); +extern void BMP_fillrect (BMP *, int x, int y, int w, int h, unsigned long); +extern unsigned long BMP_getpixel (BMP*, int, int); +extern int BMP_draw_mini_string (BMP *, char *, int x, int y, unsigned long); +extern int BMP_mini_string_width (char *); + +#define RGB_BLACK (0) +#define RGB_GRAY (0xc0c0c0) +#define RGB_RED (0xff0000) +#define RGB_MAGENTA (0xff00ff) +#define 
RGB_GREEN (0xff00) +#define RGB_DARKGREEN (0x6400) +#define RGB_BLUE (0xff) +#define RGB_WHITE (0xffffff) +#define RGB_YELLOW (0xffff00) +#define RGB_CYAN (0xffff) +#define RGB_NAVYBLUE (0x80) +#define RGB_ORANGE (0xffa500) +#define RGB_DARKORANGE (0xff8c00) +#define RGB_PURPLE (0xa020f0) +#define RGB_MAROON (0x800000) +#define RGB_SALMON (0xfa8072) +#define RGB_BRASS (0xc3a368) +#define RGB_LEMONYELLOW (0xfde910) + +#endif +
diff --git a/COPYING.txt b/COPYING.txt new file mode 100644 index 0000000..3912109 --- /dev/null +++ b/COPYING.txt
@@ -0,0 +1,340 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. 
+ + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. 
The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. 
(Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. 
You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. 
+ +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. 
If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. 
+ + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License.
diff --git a/README.txt b/README.txt new file mode 100644 index 0000000..5acb2ff --- /dev/null +++ b/README.txt
@@ -0,0 +1,145 @@ + +This is the Readme file for my program +called "bandwidth". + +Bandwidth is a benchmark that attempts to measure +primarily memory bandwidth. In December 2010 and +as of release 0.24a, I have extended 'bandwidth' +to measure network bandwidth as well. + +It's useful because hardware specifications are +sometimes incomplete or misleading. + +-------------------------------------------------- +MEMORY BANDWIDTH + +Bandwidth performs sequential and random +reads and writes of varying sizes. This permits +you to see in the numbers how each type of memory +is performing. So for instance when bandwidth +writes a 256-byte chunk, you know that because +caches are normally write-back, this chunk +will reside entirely in the L1 cache. Whereas +a 512 kB chunk will mainly reside in L2. + +You could run a non-artificial benchmark and +observe that a general performance number is lower +on that machine, but that conceals the cause. +So the purpose of this program is to help you +pinpoint the cause of a performance problem, +and determine whether it is memory related. +It also tells you the best-case scenario i.e. +the maximum bandwidth achieved using sequential, +128-bit memory accesses. + +Version 0.24 adds network bandwidth testing. + +Version 0.23 adds: +- Mac OS/X 64-bit support. +- Vector-to-vector register transfer test. +- Main register to/from vector register transfer test. +- Main register byte/word/dword/qword to/from + vector register test (pinsr*, pextr* instructions). +- Memory copy test using SSE2. +- Automatic checks under Linux for SSE2 & SSE4. + +Version 0.22 adds: +- Register-to-register transfer test. +- Register-to/from-stack transfer tests. + +Version 0.21 adds: +- Standardized memory chunks to always be + a multiple of 256-byte mini-chunks. +- Random memory accesses, in which each + 256-byte mini-chunk accessed is accessed + in a random order, but also, inside each + mini-chunk the 32/64/128 data are accessed + pseudo-randomly as well. 
+- Now 'bandwidth' includes chunk sizes that + are not powers of 2, which increases + data points around the key chunk sizes + corresponding to common L1 and L2 cache + sizes. +- Command-line options: + --quick for 0.25 seconds per test. + --slow for 20 seconds per test. + --title for adding a graph title. + +Version 0.20 added graphing, with the graph +stored in a BMP image file. It also added the +--slow option for more precise runs. + +Version 0.19 added a second 128-bit SSE writer +routine that bypasses the caches, in addition +to the one that doesn't. + +Version 0.18 was my Grand Unified bandwidth +benchmark that brought together support for +four operating systems: + - Linux + - Windows Mobile + - 32-bit Windows + - Mac OS/X 64-bit +and three processor architectures: + - x86 + - Intel64 + - ARM +I've written custom assembly routines for +each architecture. + +Total run time for the default speed, which +has 5 seconds per test, is about 35 minutes. + +-------------------------------------------------- +NETWORK BANDWIDTH (beginning with release 0.24a) + +In December 2010, I extended bandwidth to measure +network bandwidth, which is useful for testing +your home or work network setup, and in theory +could be used to test larger networks as well. + +The network test is pretty simple. It sends chunks +of data of varying sizes to whichever computers +(nodes) you specify. Each of those must be +running 'bandwidth' in transponder mode. + +The chunks of data range from 8 kB up to 32 MB. + +Sample output: + output/Network-Mac-Linux-Win32.txt + +How to start a transponder: + ./bandwidth-mac64 --transponder + +Example invocation of the test leader: + ./bandwidth64 --network 192.168.1.104 + +Areas for improvement: + +At present, the output of this test is not graphed. + +At present, the 'leader' in the test interacts +with the specified nodes by sending data but +not by receiving it. 
+ +At present, the specified nodes do not interact +with one another as part of the test. + +At present, it is not known whether the network +code will work on ARM devices. +I've only tested it on + Linux 32-bit + Mac OS/X 32- and 64-bit + Win/Cygwin 32-bit. + +At present, it uses port 49000 but later +the port will be specifiable. + +-------------------------------------------------- +This program is provided without any warranty +and AS-IS. See the file COPYING for details. + +Zack Smith +fbui@comcast.net +December 2010 +
diff --git a/defs.h b/defs.h new file mode 100644 index 0000000..eb269bc --- /dev/null +++ b/defs.h
/*============================================================================
  bandwidth 0.24, a benchmark to estimate memory transfer bandwidth.
  Copyright (C) 2005-2010 by Zack T Smith.

  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

  The author may be reached at fbui@comcast.net.
 *===========================================================================*/

//---------------------------------------------------------------------------
// Change log
// 0.18	Grand unified version supports x86/intel64/arm, linux/win32/winmo.
// 0.19	Now have 128-bit writer that goes to cache AND one that bypasses.
// 0.20	Added my bmplib and graphing of output. Also added --slow option.
// 0.21	Adds random testing. Min chunk size = 256 B. Allows non-2^n chunks.
// 0.22	Adds register-to-register and register-to/from-stack transfers.
// 0.23	Adds vector-to-vector and register-to-vector transfers, & Mac support.
// 0.24	Adds network bandwidth tests from this PC to specified others.
//---------------------------------------------------------------------------

#ifndef _DEFS_H
#define _DEFS_H

// Program version, as a narrow and as a wide string literal.
#define VERSION "0.24a"
#define VERSION_W L"0.24a"
#define APPNAME L"Bandwidth WinMo " VERSION_W

// Home-grown boolean type for pre-C99 toolchains.
// It is skipped when <stdbool.h> has already supplied a `bool` macro, and
// also when bool/true/false are language keywords (C++ and C23 onwards),
// where redefining them would be a hard compile error.
#if !defined(bool) && !defined(__cplusplus) && \
	!(defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L)
typedef char bool;
enum { true = 1, false = 0 };
#endif

// Network test parameters.
#define NETWORK_DEFAULT_PORTNUM (49000)
#define NETWORK_CHUNK_SIZE (8192)

//---------------------------------------------------------------------------
// Transfer routines implemented outside this header (the README describes
// them as custom assembly routines, one set per architecture).
//---------------------------------------------------------------------------
extern int Reader (void *ptr, unsigned long size, unsigned long loops);
extern int RandomReader (void *ptr, unsigned long n_chunks, unsigned long loops);

extern int Writer (void *ptr, unsigned long size, unsigned long loops, unsigned long value);
extern int RandomWriter (void *ptr, unsigned long size, unsigned long loops, unsigned long value);

extern int RegisterToRegister (unsigned long);

extern int StackReader (unsigned long);
extern int StackWriter (unsigned long);

#ifndef __arm__
extern int RegisterToVector (unsigned long);	// SSE2
extern int Register8ToVector (unsigned long);	// SSE2
extern int Register16ToVector (unsigned long);	// SSE2
extern int Register32ToVector (unsigned long);	// SSE2
extern int Register64ToVector (unsigned long);	// SSE2

extern int VectorToVector (unsigned long);	// SSE2

extern int VectorToRegister (unsigned long);	// SSE2
extern int Vector8ToRegister (unsigned long);	// SSE2
extern int Vector16ToRegister (unsigned long);	// SSE2
extern int Vector32ToRegister (unsigned long);	// SSE2
extern int Vector64ToRegister (unsigned long);	// SSE2

extern int CopySSE (void*, void*, unsigned long, unsigned long);	// SSE2
extern int Copy (void*, void*, unsigned long, unsigned long);

extern int ReaderSSE2 (void *ptr, unsigned long, unsigned long);
extern int RandomReaderSSE2 (unsigned long **ptr, unsigned long, unsigned long);

extern int WriterSSE2 (void *ptr, unsigned long, unsigned long, unsigned long);
extern int RandomWriterSSE2(unsigned long **ptr, unsigned long, unsigned long, unsigned long);

extern int WriterSSE2_bypass (void *ptr, unsigned long, unsigned long, unsigned long);
extern int RandomWriterSSE2_bypass (unsigned long **ptr, unsigned long, unsigned long, unsigned long);

// Declared with (void) so that it is a real prototype rather than an
// old-style declaration accepting any arguments.
extern int has_sse2 (void);
#endif

// Frame-buffer test parameters.
#define FBLOOPS_R 400
#define FBLOOPS_W 800
#define FB_SIZE (640*480*2)

#endif
diff --git a/main.c b/main.c new file mode 100644 index 0000000..8847a7f --- /dev/null +++ b/main.c
@@ -0,0 +1,2245 @@ +/*============================================================================ + bandwidth 0.24, a benchmark to estimate memory transfer bandwidth. + Copyright (C) 2005-2010 by Zack T Smith. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + The author may be reached at fbui@comcast.net. + *===========================================================================*/ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <sys/param.h> +#include <sys/types.h> +#include <sys/time.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include <fcntl.h> +#include <unistd.h> +#include <wchar.h> +#include <math.h> + +#include <netdb.h> // gethostbyname +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> + +#include "defs.h" +#include "BMP.h" +#include "config.h" + +#ifdef __WIN32__ +#include <windows.h> +#endif + +#ifdef __linux__ +#include <linux/fb.h> +#include <sys/mman.h> +#endif + +#ifdef CONFIG_ARCH_S2L +#if defined(CONFIG_BSP_BOARD_S2LM_KIWI) || defined(CONFIG_BSP_BOARD_STRAWBERRY) +#define DRAM_SIZE_SMALL +#endif +#endif + +//---------------------------------------- +// Graphing data. +// +static char graph_title [500]; +#define TITLE "Results from bandwidth " VERSION " by Zack Smith, http://caladan.tk" +static BMP *graph; // Graph of results. 
+static int graph_width = 1280; +static int graph_height = 720; +static int graph_left_margin = 100; +static int graph_margin = 50; // top/bottom/right +static int graph_x_span = 1; +static int graph_y_span = 1; +static int graph_last_x = -1; +static int graph_last_y = -1; +static unsigned long graph_fg = RGB_BLACK; +static int legend_y; +#define MAX_GRAPH_DATA 5000 +static long graph_data [MAX_GRAPH_DATA]; +static int graph_data_index = 0; +enum { + DATUM_SIZE=0, + DATUM_AMOUNT=1, + DATUM_COLOR=2, +}; +static int max_bandwidth = 0; // Always 10 times the # of megabyte/sec. + +static bool use_sse2 = true; +static bool use_sse4 = true; + +//---------------------------------------- +// Parameters for the tests. +// +static long usec_per_test = 5000000; // 5 seconds per test. + +static int chunk_sizes[] = { + 256, + 512, + 768, + 1024, + 2048, + 3072, + 4096, + 6144, + 8192, // Some processors' L1 data caches are only 8kB. + 12288, + 16384, + 20480, + 24576, + 28672, + 32768, // Common L1 data cache size. + 40960, + 49152, + 65536, + 131072, // Old L2 cache size. + 192 * 1024, + 256 * 1024, // Old L2 cache size. + 384 * 1024, + 512 * 1024, // Old L2 cache size. + 768 * 1024, + 1 << 20, // 1 MB = common L2 cache size. + (1024 + 256) * 1024, // 1.25 + (1024 + 512) * 1024, // 1.5 + (1024 + 768) * 1024, // 1.75 + 1 << 21, // 2 MB = common L2 cache size. + (2048 + 256) * 1024, // 2.25 + (2048 + 512) * 1024, // 2.5 + (2048 + 768) * 1024, // 2.75 + 3072 * 1024, // 3 MB = common L2 cache sized. + 1 << 22, // 4 MB + 5242880, // 5 megs + 6291456, // 6 megs (std L2 cache size) +#if !defined(__arm__) && !defined(__aarch64__) + 7 * 1024 * 1024, // 7 megs + 8 * 1024 * 1024, + 16 * 1024 * 1024, + 64 * 1024 * 1024, +#endif + 0 +}; + +//---------------------------------------- +// Under CeGCC, the math.h log2() function +// turned out to be very inaccurate e.g. +// log2(8)=1.44, so I have here hard-coded +// the logarithms. 
//
// chunk_sizes_log2 holds one hard-coded base-2 logarithm per chunk size
// and ends with a 0 sentinel. graph_plot() looks a size up in
// chunk_sizes[] and uses the same index here, so the two tables must be
// kept in the same order.
//
static double chunk_sizes_log2[] =
{
	8,		// 256 B
	9,		// 512 B
	9.585,		// 768 B
	10,		// 1 kB
	11,		// 2 kB
	11.585,		// 3 kB
	12,		// 4 kB
	12.585,		// 6 kB
	13,		// 8 kB
	13.585,		// 12 kB
	14,		// 16 kB
	14.3219,	// 20 kB
	14.585,		// 24 kB
	14.8074,	// 28 kB
	15,		// 32 kB
	15.3219,	// 40 kB
	15.585,		// 48 kB
	16,		// 64 kB
	17,		// 128 kB
	17.585,		// 192 kB
	18,		// 256 kB
	18.585,		// 384 kB
	19,		// 512 kB
	19.585,		// 768 kB
	20,		// 1 MB
	20.3219,	// 1.25
	20.585,		// 1.5
	20.8074,	// 1.75
	21,		// 2 MB
	21.1699,	// 2.25 MB
	21.3219,	// 2.5 MB
	21.4594,	// 2.75 MB
	21.585,		// 3 MB
	22,		// 4 MB
	22.3219,	// 5 MB
	22.585,		// 6 MB
#if !defined(__arm__) && !defined(__aarch64__)
	22.8074,	// 7 MB
	23,		// 8 MB
	24,		// 16 MB
	26,		// 64 MB
#endif
	0
};

static int min_chunk_size = 1;	// These are determined in graph_draw_labels().
static int max_chunk_size = 1;

//----------------------------------------------------------------------------
// Name:	error
// Purpose:	Complain and exit.
// Params:	s = NUL-terminated message to report.
// Note:	Never returns. Elsewhere it prints to stderr; under Win32 it
//		shows a message box instead.
//----------------------------------------------------------------------------
void error (char *s)
{
#ifndef __WIN32__
	fprintf (stderr, "Error: %s\n", s);
	exit (1);
#else
	wchar_t tmp [200];
	size_t limit = sizeof (tmp) / sizeof (tmp[0]) - 1;
	size_t i;

	// Widen the message, truncating it so that an over-long string
	// cannot run off the end of tmp.
	for (i = 0; s[i] && i < limit; i++)
		tmp[i] = (wchar_t) (unsigned char) s[i];
	tmp[i] = 0;

	MessageBoxW (0, tmp, L"Error", 0);
	ExitProcess (1);	// Non-zero, matching the exit(1) used elsewhere.
#endif
}

//----------------------------------------------------------------------------
// Name:	dump_hex64
// Purpose:	Prints a 64-bit value on stdout as exactly 16 lower-case
//		hexadecimal digits, most significant first, without a newline.
//----------------------------------------------------------------------------
void
dump_hex64 (unsigned long long value)
{
	printf ("%016llx", value);
}

//============================================================================
// Graphing logic.
//============================================================================

//----------------------------------------------------------------------------
// Name:	graph_draw_labels
// Purpose:	Draw the labels and ticks.
+//---------------------------------------------------------------------------- +void +graph_draw_labels () +{ + int i; + + //---------------------------------------- + // Horizontal + // + //-------------------- + // Establish min & max. + // + min_chunk_size = 1000; + max_chunk_size = 0; + i = 0; + int j; + while ((j = chunk_sizes_log2 [i])) { + if (j < min_chunk_size) + min_chunk_size = j; + if (j > max_chunk_size) + max_chunk_size = j; + i++; + } + + for (i = min_chunk_size; i <= max_chunk_size; i++) { + char str[20]; + int x = graph_left_margin + + ((i-min_chunk_size) * graph_x_span) / + (max_chunk_size - min_chunk_size); + int y = graph_height - graph_margin + 10; + + unsigned long amt = 1 << i; + if (amt < 1024) + sprintf (str, "%ld B", amt); + else if (amt < (1<<20)) { + sprintf (str, "%ld kB", amt >> 10); + } + else { + j = amt >> 20; + switch ((amt >> 18) & 3) { + case 0: sprintf (str, "%d MB", j); break; + case 1: sprintf (str, "%d.25 MB", j); break; + case 2: sprintf (str, "%d.5 MB", j); break; + case 3: sprintf (str, "%d.75 MB", j); break; + } + } + + BMP_vline (graph, x, y, y-10, RGB_BLACK); + BMP_draw_mini_string (graph, str, x - 10, y+8, RGB_BLACK); + } + + //---------------------------------------- + // Vertical + // + for (i = 0; i <= (max_bandwidth/10000); i++) { + char str[20]; + int x = graph_left_margin - 10; + int y = graph_height - graph_margin - + (i * graph_y_span) / (max_bandwidth/10000); + + BMP_hline (graph, x, x+10, y, RGB_BLACK); + + sprintf (str, "%d GB/s", i); + BMP_draw_mini_string (graph, str, + x - 40, y - MINIFONT_HEIGHT/2, RGB_BLACK); + } +} + +void +graph_init () +{ + if (!graph) + return; + + BMP_clear (graph, RGB_WHITE); + + BMP_hline (graph, graph_left_margin, graph_width - graph_margin, + graph_height - graph_margin, RGB_BLACK); + BMP_vline (graph, graph_left_margin, graph_margin, + graph_height - graph_margin, RGB_BLACK); + + graph_x_span = graph_width - (graph_margin + graph_left_margin); + graph_y_span = graph_height - 
2 * graph_margin; + + BMP_draw_mini_string (graph, graph_title, + graph_left_margin, graph_margin/2, RGB_BLACK); + + legend_y = graph_margin; +} + +void +graph_new_line (char *str, unsigned long color) +{ + BMP_draw_mini_string (graph, str, + graph_width - graph_margin - 200, legend_y, color); + + legend_y += 10; + + graph_fg = color; + graph_last_x = graph_last_y = -1; + + if (graph_data_index >= MAX_GRAPH_DATA-2) + error ("Too many graph data."); + + graph_data [graph_data_index++] = DATUM_COLOR; + graph_data [graph_data_index++] = (long) color; +} + +//---------------------------------------------------------------------------- +// Name: graph_add_point +// Purpose: Adds a point to this list to be drawn. +//---------------------------------------------------------------------------- +void +graph_add_point (int size, int amount) +{ + if (graph_data_index >= MAX_GRAPH_DATA-4) + error ("Too many graph data."); + + graph_data [graph_data_index++] = DATUM_SIZE; + graph_data [graph_data_index++] = size; + graph_data [graph_data_index++] = DATUM_AMOUNT; + graph_data [graph_data_index++] = amount; +} + +//---------------------------------------------------------------------------- +// Name: graph_plot +// Purpose: Plots a point on the current graph. +//---------------------------------------------------------------------------- +void +graph_plot (int size, int amount) +{ + //---------------------------------------- + // Get the log2 of the chunk size. + // We cannot rely on the libm math.h log2 + // function, because under CeGCC, + // log2(8) = 1.44. + // + int i = 0; + while (chunk_sizes [i] && chunk_sizes [i] != size) + i++; + if (!chunk_sizes [i]) + error ("Lookup of chunk size failed."); + double tmp = chunk_sizes_log2 [i]; + + //---------------------------------------- + // Plot the point. The x axis is + // logarithmic, base 2. 
+ // + tmp -= (double) min_chunk_size; + tmp *= (double) graph_x_span; + tmp /= (double) (max_chunk_size - min_chunk_size); + + int x = graph_left_margin + (int) tmp; + int y = graph_height - graph_margin - + (amount * graph_y_span) / max_bandwidth; + +// Really I ought to save all data points, take max of everything, then plot. + + if (graph_last_x != -1 && graph_last_y != -1) { + BMP_line (graph, graph_last_x, graph_last_y, x, y, graph_fg); + } + + graph_last_x = x; + graph_last_y = y; +} + +//---------------------------------------------------------------------------- +// Name: graph_make +// Purpose: Plots all lines. +//---------------------------------------------------------------------------- +void +graph_make () +{ + int i; + + //---------------------------------------- + // Get the maximum bandwidth in order to + // properly scale the graph vertically. + // + max_bandwidth = 0; + for (i = 0; i < graph_data_index; i += 2) { + if (graph_data[i] == DATUM_AMOUNT) { + int amt = graph_data[i+1]; + if (amt > max_bandwidth) + max_bandwidth = amt; + } + } + max_bandwidth /= 10000; + max_bandwidth *= 10000; + max_bandwidth += 10000; + + graph_draw_labels (); + + //---------------------------------------- + // OK, now draw the lines. + // + int size = -1, amt = -1; + for (i = 0; i < graph_data_index; i += 2) + { + int type = graph_data[i]; + long value = graph_data[i+1]; + + switch (type) { + case DATUM_AMOUNT: amt = value; break; + case DATUM_SIZE: size = value; break; + case DATUM_COLOR: + graph_fg = (unsigned long) value; + graph_last_x = -1; + graph_last_y = -1; + break; + } + + if (amt != -1 && size != -1) { + graph_plot (size, amt); + amt = size = -1; + } + } +} + +//============================================================================ +// Output buffer logic. 
//============================================================================

// All output is collected in msg[], which dump() later writes out and
// empties. Every append below is bounded by the room left in msg[]; text
// that does not fit is cut off rather than written past the end.
#define MSGLEN 10000
static wchar_t msg [MSGLEN];

// Appends the string s to the output buffer.
void print (wchar_t *s)
{
	wcsncat (msg, s, MSGLEN - 1 - wcslen (msg));
}

// Appends a line feed to the output buffer.
void newline ()
{
	wcsncat (msg, L"\n", MSGLEN - 1 - wcslen (msg));
}

// Appends the string s followed by a line feed.
void println (wchar_t *s)
{
	print (s);
	newline ();
}

// Appends the decimal representation of d.
void print_int (int d)
{
	size_t used = wcslen (msg);
#if defined(__WIN32__) && (defined(__arm__) || defined(__aarch64__))
	// This swprintf variant takes no size argument.
	swprintf (msg + used, L"%d", d);
#else
	// The limit is the room that is left, not the size of the whole buffer.
	if (swprintf (msg + used, MSGLEN - used, L"%d", d) < 0)
		msg [used] = 0;	// Did not fit: discard the partial number.
#endif
}

// Appends the decimal representation of d followed by a line feed.
void println_int (int d)
{
	print_int (d);
	newline ();
}

// Appends result with one decimal place, followed by " MB/s".
void print_result (long double result)
{
	size_t used = wcslen (msg);
#if defined(__WIN32__) && (defined(__arm__) || defined(__aarch64__))
	swprintf (msg + used, L"%.1Lf MB/s", result);
#else
	if (swprintf (msg + used, MSGLEN - used, L"%.1Lf MB/s", result) < 0)
		msg [used] = 0;	// Did not fit: discard the partial text.
#endif
}

// Writes the buffered text to f (stdout if f is NULL), narrowing every
// wide character to a char, then empties the buffer.
void dump (FILE *f)
{
	if (!f)
		f = stdout;

	int i;
	for (i = 0; msg[i]; i++) {
		char ch = (char) msg[i];
		fputc (ch, f);
	}

	msg [0] = 0;
}

// Sends the buffered text to stdout and flushes stdout. On Win32 ARM
// builds it only beeps and the buffer is left untouched.
void flush ()
{
#if defined(__WIN32__) && (defined(__arm__) || defined(__aarch64__))
	MessageBeep (MB_OK);
#else
	dump (NULL);
	fflush (stdout);
#endif
}

// Appends a byte count as " B", " kB" or " MB"; megabyte values show the
// quarter-megabyte fraction (.25, .5, .75) when there is one.
void print_size (unsigned long size)
{
	if (size < 1024) {
		print_int ((int) size);
		print (L" B");
	}
	else if (size < (1<<20)) {
		print_int ((int) (size >> 10));
		print (L" kB");
	} else {
		print_int ((int) (size >> 20));
		switch ((size >> 18) & 3) {
		case 1: print (L".25"); break;
		case 2: print (L".5"); break;
		case 3: print (L".75"); break;
		}
		print (L" MB");
	}
}

//============================================================================
// Timing logic.
//============================================================================

//----------------------------------------------------------------------------
// Name:	mytime
// Purpose:	Reports time in microseconds.
//----------------------------------------------------------------------------
// Returns a microsecond count taken from gettimeofday(), or from
// GetTickCount() under Win32. Callers use it only to take differences.
unsigned long mytime ()
{
#ifndef __WIN32__
	struct timeval tv;

	// The timezone argument is obsolete, so NULL is passed.
	gettimeofday (&tv, NULL);

	// Convert to unsigned before scaling: where time_t is 32 bits wide,
	// 1000000 * tv_sec would overflow a signed type, which is undefined.
	// Unsigned arithmetic wraps, so differences stay valid.
	return 1000000UL * (unsigned long) tv.tv_sec + (unsigned long) tv.tv_usec;
#else
	return 1000UL * GetTickCount ();	// accurate enough.
#endif
}

//----------------------------------------------------------------------------
// Name:	calculate_result
// Purpose:	Calculates and prints a result.
// Params:	chunk_size  = bytes transferred per loop.
//		total_count = number of loops performed.
//		diff        = elapsed time in microseconds; 0 is fatal.
// Returns:	10 times the number of megabytes per second.
//----------------------------------------------------------------------------
int
calculate_result (unsigned long chunk_size, long long total_count, long diff)
{
	if (!diff)
		error ("Zero time difference.");

// printf ("\nIn calculate_result, chunk_size=%ld, total_count=%lld, diff=%ld\n", chunk_size, total_count, diff);

	// bytes * 1000000 / 1048576 / microseconds = megabytes per second.
	long double result = (long double) chunk_size;
	result *= (long double) total_count;
	result *= 1000000.;
	result /= 1048576.;
	result /= (long double) diff;

	print_result (result);

	return (long) (10.0 * result);
}

//============================================================================
// Tests.
//============================================================================

//----------------------------------------------------------------------------
// Name:	do_write
// Purpose:	Performs write on chunk of memory of specified size.
+//---------------------------------------------------------------------------- +enum { + NO_SSE2, + SSE2, + SSE2_BYPASS, +}; +int +do_write (unsigned long size, int mode, bool random) +{ + unsigned char *chunk; + unsigned char *chunk0; + unsigned long loops; + unsigned long long total_count=0; +#if defined(__x86_64__) || defined(__aarch64__) + unsigned long value = 0x1234567689abcdef; +#else + unsigned long value = 0x12345678; +#endif + unsigned long diff=0, t0; + unsigned long tmp; + unsigned long **chunk_ptrs = NULL; + + if (size & 255) + error ("do_write(): chunk size is not multiple of 256."); + + //------------------------------------------------- + chunk0 = malloc (size+32); + chunk = chunk0; + if (!chunk) + error ("Out of memory"); + + tmp = (unsigned long) chunk; + if (tmp & 15) { + tmp -= (tmp & 15); + tmp += 16; + chunk = (unsigned char*) tmp; + } + + //---------------------------------------- + // Set up random pointers to chunks. + // + if (random) { + tmp = size/256; + chunk_ptrs = (unsigned long**) malloc (sizeof (unsigned long*) * tmp); + if (!chunk_ptrs) + error ("Out of memory."); + + //---------------------------------------- + // Store pointers to all chunks into array. + // + int i; + for (i = 0; i < tmp; i++) { + chunk_ptrs [i] = (unsigned long*) (chunk + 256 * i); + } + + //---------------------------------------- + // Randomize the array of chunk pointers. 
+ // + int k = 100; + while (k--) { + for (i = 0; i < tmp; i++) { + int j = rand() % tmp; + if (i != j) { + unsigned long *ptr = chunk_ptrs [i]; + chunk_ptrs [i] = chunk_ptrs [j]; + chunk_ptrs [j] = ptr; + } + } + } + } + + //------------------------------------------------- + if (random) + print (L"Random write "); + else + print (L"Sequential write "); + + if (mode == SSE2) { + print (L"(128-bit), size = "); + } + else + if (mode == SSE2_BYPASS) { + print (L"bypassing cache (128-bit), size = "); + } else { +#if defined(__x86_64__) || defined(__aarch64__) + print (L"(64-bit), size = "); +#else + print (L"(32-bit), size = "); +#endif + } + + print_size (size); + print (L", "); + + loops = (1 << 26) / size;// XX need to adjust for CPU MHz + + t0 = mytime (); + + while (diff < usec_per_test) { + total_count += loops; + +#if !defined(__arm__) && !defined(__aarch64__) + if (mode == SSE2) { + if (random) + RandomWriterSSE2 (chunk_ptrs, size/256, loops, value); + else + WriterSSE2 (chunk, size, loops, value); + } + else + if (mode == SSE2_BYPASS) { + if (random) + RandomWriterSSE2_bypass (chunk_ptrs, size/256, loops, value); + else + WriterSSE2_bypass (chunk, size, loops, value); + } + else +#endif + if (random) + RandomWriter (chunk_ptrs, size/256, loops, value); + else + Writer (chunk, size, loops, value); + + diff = mytime () - t0; + } + + print (L"loops = "); + print_int (total_count); + print (L", "); + + flush (); + + int result = calculate_result (size, total_count, diff); + newline (); + + flush (); + + free ((void*)chunk0); + + if (chunk_ptrs) + free (chunk_ptrs); + + return result; +} + + +//---------------------------------------------------------------------------- +// Name: do_read +// Purpose: Performs sequential read on chunk of memory of specified size. 
+//---------------------------------------------------------------------------- +int +do_read (unsigned long size, bool use_sse2, bool random) +{ + unsigned long loops; + unsigned long long total_count = 0; + unsigned long t0, diff=0; + unsigned char *chunk; + unsigned char *chunk0; + unsigned long tmp; + unsigned long **chunk_ptrs = NULL; + + if (size & 255) + error ("do_read(): chunk size is not multiple of 256."); + + //------------------------------------------------- + chunk0 = chunk = malloc (size+32); + if (!chunk) + error ("Out of memory"); + + memset (chunk, 0, size); + + tmp = (unsigned long) chunk; + if (tmp & 15) { + tmp -= (tmp & 15); + tmp += 16; + chunk = (unsigned char*) tmp; + } + + //---------------------------------------- + // Set up random pointers to chunks. + // + if (random) { + int tmp = size/256; + chunk_ptrs = (unsigned long**) malloc (sizeof (unsigned long*) * tmp); + if (!chunk_ptrs) + error ("Out of memory."); + + //---------------------------------------- + // Store pointers to all chunks into array. + // + int i; + for (i = 0; i < tmp; i++) { + chunk_ptrs [i] = (unsigned long*) (chunk + 256 * i); + } + + //---------------------------------------- + // Randomize the array of chunk pointers. 
+ // + int k = 100; + while (k--) { + for (i = 0; i < tmp; i++) { + int j = rand() % tmp; + if (i != j) { + unsigned long *ptr = chunk_ptrs [i]; + chunk_ptrs [i] = chunk_ptrs [j]; + chunk_ptrs [j] = ptr; + } + } + } + } + + //------------------------------------------------- + if (random) + print (L"Random read "); + else + print (L"Sequential read "); + + if (use_sse2) { + print (L"(128-bit), size = "); + } else { +#if defined(__x86_64__) || defined(__aarch64__) + print (L"(64-bit), size = "); +#else + print (L"(32-bit), size = "); +#endif + } + + print_size (size); + print (L", "); + + flush (); + + loops = (1 << 26) / size; // XX need to adjust for CPU MHz + + t0 = mytime (); + + while (diff < usec_per_test) { + total_count += loops; + +#if !defined(__arm__) && !defined(__aarch64__) + if (use_sse2) { + if (random) + RandomReaderSSE2 (chunk_ptrs, size/256, loops); + else + ReaderSSE2 (chunk, size, loops); + } + else +#endif + if (random) + RandomReader (chunk_ptrs, size/256, loops); + else + Reader (chunk, size, loops); + + diff = mytime () - t0; + } + + print (L"loops = "); + print_int (total_count); + print (L", "); + + int result = calculate_result (size, total_count, diff); + newline (); + + flush (); + + free (chunk0); + + if (chunk_ptrs) + free (chunk_ptrs); + + return result; +} + + + +//---------------------------------------------------------------------------- +// Name: do_copy +// Purpose: Performs sequential memory copy. 
+//---------------------------------------------------------------------------- +int +do_copy (unsigned long size, int mode) +{ + unsigned long loops; + unsigned long long total_count = 0; + unsigned long t0, diff=0; + unsigned char *chunk_src; + unsigned char *chunk_dest; + unsigned char *chunk_src0; + unsigned char *chunk_dest0; + unsigned long tmp; + + if (size & 255) + error ("do_copy(): chunk size is not multiple of 256."); + + //------------------------------------------------- + chunk_src0 = chunk_src = malloc (size+32); + if (!chunk_src) + error ("Out of memory"); + chunk_dest0 = chunk_dest = malloc (size+32); + if (!chunk_dest) + error ("Out of memory"); + + memset (chunk_src, 100, size); + memset (chunk_dest, 200, size); + + tmp = (unsigned long) chunk_src; + if (tmp & 15) { + tmp -= (tmp & 15); + tmp += 16; + chunk_src = (unsigned char*) tmp; + } + tmp = (unsigned long) chunk_dest; + if (tmp & 15) { + tmp -= (tmp & 15); + tmp += 16; + chunk_dest = (unsigned char*) tmp; + } + + //------------------------------------------------- + print (L"Sequential copy "); + + if (mode == SSE2) { + print (L"(128-bit), size = "); + } + else { +#if defined(__x86_64__) || defined(__aarch64__) + print (L"(64-bit), size = "); +#else + print (L"(32-bit), size = "); +#endif + } + + print_size (size); + print (L", "); + + flush (); + + loops = (1 << 26) / size; // XX need to adjust for CPU MHz + + t0 = mytime (); + + while (diff < usec_per_test) { + total_count += loops; + +#if !defined(__arm__) && !defined(__aarch64__) + if (mode == SSE2) + CopySSE (chunk_dest, chunk_src, size, loops); +#if 0 + else + Copy (chunk_dest, chunk_src, size, loops); +#endif +#endif + + diff = mytime () - t0; + } + + print (L"loops = "); + print_int (total_count); + print (L", "); + + int result = calculate_result (size, total_count, diff); + newline (); + + flush (); + + free (chunk_src0); + free (chunk_dest0); + + return result; +} + + 
//----------------------------------------------------------------------------
// Name:	fb_readwrite
// Purpose:	Performs sequential read & write tests on framebuffer memory.
//
// use_sse2:	when true (and not an ARM build) the 128-bit reader and the
//		cache-bypassing 128-bit writer are used; otherwise the
//		main-register-width routines.
//
// Opens /dev/fb0 (falling back to /dev/fb/0), maps the framebuffer, runs one
// timed read pass and one timed write pass over FB_SIZE bytes, then unmaps
// the framebuffer and closes the device. Every failure is reported with
// println() and the function simply returns.
//----------------------------------------------------------------------------
#if defined(__linux__) && defined(FBIOGET_FSCREENINFO)
void
fb_readwrite (bool use_sse2)
{
	unsigned long total_count;
	unsigned long length;
	unsigned long diff, t0;
	unsigned long fblen;
	static struct fb_fix_screeninfo fi;
	static struct fb_var_screeninfo vi;
	unsigned long *fb = NULL;
	int fd;
#if defined(__x86_64__) || defined(__aarch64__)
	unsigned long value = 0x1234567689abcdef;
#else
	unsigned long value = 0x12345678;
#endif

	//-------------------------------------------------

	fd = open ("/dev/fb0", O_RDWR);
	if (fd < 0)
		fd = open ("/dev/fb/0", O_RDWR);
	if (fd < 0) {
		println (L"Cannot open framebuffer device.");
		return;
	}

	if (ioctl (fd, FBIOGET_FSCREENINFO, &fi) ||
	    ioctl (fd, FBIOGET_VSCREENINFO, &vi)) {
		close (fd);
		println (L"Cannot get framebuffer info");
		return;
	}

	if (fi.visual != FB_VISUAL_TRUECOLOR &&
	    fi.visual != FB_VISUAL_DIRECTCOLOR) {
		close (fd);
		println (L"Need direct/truecolor framebuffer device.");
		return;
	}

	print (L"Framebuffer resolution: ");
	print_int (vi.xres);
	print (L"x");
	print_int (vi.yres);
	print (L", ");
	print_int (vi.bits_per_pixel);
	println (L" bpp\n");

	// NOTE(review): smem_start is passed to mmap() as the address hint,
	// exactly as before; confirm that this is intended.
	fb = (unsigned long*) fi.smem_start;
	fblen = fi.smem_len;

	fb = mmap (fb, fblen,
		PROT_WRITE | PROT_READ,
		MAP_SHARED, fd, 0);
	if (fb == MAP_FAILED) {
		close (fd);
		println (L"Cannot access framebuffer memory.");
		return;
	}

	//-------------------
	// Use only the upper half of the display.
	//
	length = FB_SIZE;

	// Refuse to run past the end of the mapping.
	if (length > fblen) {
		munmap (fb, fblen);
		close (fd);
		println (L"Framebuffer memory is smaller than the test size.");
		return;
	}

	//-------------------
	// READ
	//
	print (L"Framebuffer memory sequential read ");
	flush ();

	t0 = mytime ();

	total_count = FBLOOPS_R;

#if !defined(__arm__) && !defined(__aarch64__)
	if (use_sse2)
		ReaderSSE2 (fb, length, FBLOOPS_R);
	else
#endif
		Reader (fb, length, FBLOOPS_R);

	diff = mytime () - t0;

	calculate_result (length, total_count, diff);
	newline ();

	//-------------------
	// WRITE
	//
	print (L"Framebuffer memory sequential write ");
	flush ();

	t0 = mytime ();

	total_count = FBLOOPS_W;

#if !defined(__arm__) && !defined(__aarch64__)
	if (use_sse2)
		WriterSSE2_bypass (fb, length, FBLOOPS_W, value);
	else
#endif
		Writer (fb, length, FBLOOPS_W, value);

	diff = mytime () - t0;

	calculate_result (length, total_count, diff);
	newline ();

	// Release the mapping and the device; both used to stay open until
	// the process exited.
	munmap (fb, fblen);
	close (fd);
}
#endif

//----------------------------------------------------------------------------
// Name:	register_test
// Purpose:	Determines bandwidth of register-to-register transfers.
+//---------------------------------------------------------------------------- +void +register_test () +{ + long long total_count = 0; + unsigned long t0; + unsigned long diff = 0; + + //-------------------------------------- +#if defined(__x86_64__) || defined(__aarch64__) + print (L"Main register to main register transfers (64-bit) "); +#else + print (L"Main register to main register transfers (32-bit) "); +#endif + flush (); +#define REGISTER_COUNT 10000 + + t0 = mytime (); + while (diff < usec_per_test) + { + RegisterToRegister (REGISTER_COUNT); + total_count += REGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + +#if !defined(__arm__) && !defined(__aarch64__) + //-------------------------------------- +#ifdef __x86_64__ + print (L"Main register to vector register transfers (64-bit) "); +#else + print (L"Main register to vector register transfers (32-bit) "); +#endif + flush (); +#define VREGISTER_COUNT 3333 + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + RegisterToVector (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + + //-------------------------------------- +#ifdef __x86_64__ + print (L"Vector register to main register transfers (64-bit) "); +#else + print (L"Vector register to main register transfers (32-bit) "); +#endif + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + VectorToRegister (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + + //-------------------------------------- + print (L"Vector register to vector register transfers (128-bit) "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + VectorToVector (VREGISTER_COUNT); + 
total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + + //-------------------------------------- + if (use_sse4) { + print (L"Vector 8-bit datum to main register transfers "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Vector8ToRegister (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + } + + //-------------------------------------- + print (L"Vector 16-bit datum to main register transfers "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Vector16ToRegister (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + + //-------------------------------------- + if (use_sse4) { + print (L"Vector 32-bit datum to main register transfers "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Vector32ToRegister (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + } + +#ifdef __x86_64__ + //-------------------------------------- + print (L"Vector 64-bit datum to main register transfers "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Vector64ToRegister (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); +#endif + + //-------------------------------------- + if (use_sse4) { + print (L"Main register 8-bit datum to vector register transfers "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Register8ToVector 
(VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + } + + //-------------------------------------- + print (L"Main register 16-bit datum to vector register transfers "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Register16ToVector (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + + //-------------------------------------- + if (use_sse4) { + print (L"Main register 32-bit datum to vector register transfers "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Register32ToVector (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + } + +#ifdef __x86_64__ + //-------------------------------------- + print (L"Main register 64-bit datum to vector register transfers "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Register64ToVector (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); +#endif +#endif +} + +//---------------------------------------------------------------------------- +// Name: stack_test +// Purpose: Determines bandwidth of stack-to/from-register transfers. 
+//---------------------------------------------------------------------------- +void +stack_test () +{ + long long total_count = 0; + unsigned long t0; + unsigned long diff = 0; + +#if defined(__x86_64__) || defined(__aarch64__) + print (L"Stack-to-register transfers (64-bit) "); +#else + print (L"Stack-to-register transfers (32-bit) "); +#endif + flush (); + + //-------------------------------------- + diff = 0; + total_count = 0; + t0 = mytime (); + while (diff < usec_per_test) + { + StackReader (REGISTER_COUNT); + total_count += REGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + +#if defined(__x86_64__) || defined(__aarch64__) + print (L"Register-to-stack transfers (64-bit) "); +#else + print (L"Register-to-stack transfers (32-bit) "); +#endif + flush (); + + //-------------------------------------- + diff = 0; + total_count = 0; + t0 = mytime (); + while (diff < usec_per_test) + { + StackWriter (REGISTER_COUNT); + total_count += REGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); +} + +//---------------------------------------------------------------------------- +// Name: library_test +// Purpose: Performs C library tests (memset, memcpy). 
//----------------------------------------------------------------------------
// Buffer size (NT_SIZE, in bytes) and repetition count (NT_SIZE2) used by
// library_test(); ARM builds use smaller values than the other builds.
#if defined(__arm__) || defined(__aarch64__)
  #if defined(__WIN32__)
	#define NT_SIZE (1024*1024)
  #elif defined(DRAM_SIZE_SMALL)
	#define NT_SIZE (16*1024*1024)
  #else
	#define NT_SIZE (32*1024*1024)
  #endif
	#define NT_SIZE2 (50)
#else
	#define NT_SIZE (64*1024*1024)
	#define NT_SIZE2 (100)
#endif

// library_test
//
// Times NT_SIZE2 calls of memset() over one NT_SIZE-byte buffer, then
// NT_SIZE2 calls of memcpy() between two such buffers, and prints one result
// line for each.
void
library_test ()
{
	char *fill_buf;
	char *copy_buf;
	unsigned long started, finished;
	int pass;

	fill_buf = malloc (NT_SIZE);
	if (!fill_buf)
		error ("Out of memory");

	copy_buf = malloc (NT_SIZE);
	if (!copy_buf)
		error ("Out of memory");

	//--------------------------------------
	// memset: the fill byte changes on every pass.
	//
	started = mytime ();
	pass = 0;
	while (pass < NT_SIZE2) {
		memset (fill_buf, pass, NT_SIZE);
		pass++;
	}
	finished = mytime ();

	print (L"Library: memset ");
	calculate_result (NT_SIZE, NT_SIZE2, finished - started);
	newline ();

	flush ();

	//--------------------------------------
	// memcpy
	//
	started = mytime ();
	pass = 0;
	while (pass < NT_SIZE2) {
		memcpy (copy_buf, fill_buf, NT_SIZE);
		pass++;
	}
	finished = mytime ();

	print (L"Library: memcpy ");
	calculate_result (NT_SIZE, NT_SIZE2, finished - started);
	newline ();

	flush ();

	free (fill_buf);
	free (copy_buf);
}

//----------------------------------------------------------------------------
// Name:	network_test_core
// Purpose:	Performs the network test, talking to and receiving data
//		back from a transponder node.
// Note:	Port number specified using server:# notation.
// Returns:	-1 on error, else the network duration in microseconds.
+//---------------------------------------------------------------------------- +long +network_test_core (const char *net_path, char *chunk, + unsigned long chunk_size, + unsigned long count) +{ + char hostname [PATH_MAX]; + char *s; + int port = NETWORK_DEFAULT_PORTNUM ; + strcpy (hostname, net_path); + if ((s = strchr (hostname, ':'))) { + *s++ = 0; + port = atoi (s); + } + + struct hostent* host = gethostbyname (hostname); + if (!host) + return -1; + + char *host_ip = inet_ntoa (*(struct in_addr *)*host->h_addr_list); + int sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + + struct sockaddr_in addr; + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = inet_addr(host_ip); + addr.sin_port = htons(port); + + if (connect (sock, (struct sockaddr*) &addr, sizeof (struct sockaddr))) + { + // perror ("connect"); + close (sock); + return -1; + } + + //------------------------------------ + // Send all of our data. + // + unsigned long t0 = mytime (); + int i; + for (i = 0; i < count; i++) + send (sock, chunk, chunk_size, 0); + +#if 0 + //------------------------------------ + // Set nonblocking mode. + // + int opt = 1; + ioctl (sock, FIONBIO, &opt); +#endif + + //------------------------------------ + // Read the response. 
+ // + char *buffer = malloc (chunk_size); + if (!buffer) { + close (sock); + // perror ("malloc"); + return -1; + } + int amount = recv (sock, buffer, chunk_size, 0); + if (amount <= 0) { + close (sock); + //perror ("recv"); + return -1; + } + + long t = mytime () - t0; + close (sock); + free (buffer); + return t; +} + +//---------------------------------------------------------------------------- +// Name: ip_to_str +//---------------------------------------------------------------------------- +void +ip_to_str (unsigned long addr, char *str) +{ + if (!str) + return; + + unsigned short a = 0xff & addr; + unsigned short b = 0xff & (addr >> 8); + unsigned short c = 0xff & (addr >> 16); + unsigned short d = 0xff & (addr >> 24); + sprintf (str, "%u.%u.%u.%u", a,b,c,d); +} + +//---------------------------------------------------------------------------- +// Name: network_transponder +// Purpose: Act as a transponder, receiving chunks of data and sending +// back an acknowledgement once the enture chunk is read. +// Returns: False if a problem occurs setting up the network socket. +//---------------------------------------------------------------------------- +bool +network_transponder () +{ + struct sockaddr_in sin, from; + + //------------------------------ + // Get listening socket for port. + // Then listen on given port#. + // + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(NETWORK_DEFAULT_PORTNUM); + int listensock; + if ((listensock = socket (AF_INET, SOCK_STREAM, 0)) < 0) { + perror ("socket"); + return false; + } + if (bind (listensock, (struct sockaddr*) &sin, sizeof(sin)) < 0) { + perror ("bind"); + close (listensock); + return false; + } + if (listen (listensock, 500) < 0) { + perror ("listen"); + close (listensock); + return false; + } + + bool done = false; + while (!done) { + //---------------------------------------- + // Wait for a client to contact us. 
+ // + socklen_t len = sizeof (struct sockaddr); + int sock = accept (listensock, (struct sockaddr*) &from, &len); + if (sock < 0) { + perror ("accept"); + close (listensock); + return false; + } + + if (len != sizeof (struct sockaddr_in)) { + close (sock); + close (listensock); + return false; + } + +#if 0 + unsigned long ipaddr = from.sin_addr.s_addr; + char ipstring[30]; + ip_to_str (ipaddr, ipstring); + fprintf (stderr, "Incoming connection from %s\n", ipstring); +#endif + + char chunk [NETWORK_CHUNK_SIZE+1]; + long n_chunks = 0; + int amount_read = read (sock, chunk, NETWORK_CHUNK_SIZE); + chunk [amount_read] = 0; + if (1 != sscanf (chunk, "%ld", &n_chunks)) { + close (sock); + close (listensock); + return false; + } + + //---------------------------------------- + // If the leader sends us a chunk count of + // -99, this indicates that we should exit. + // + if (n_chunks == -99) { + close (sock); + close (listensock); + return true; + } + +// printf ("Reading %lu chunks of %d bytes...\n", n_chunks, NETWORK_CHUNK_SIZE); + + unsigned long long remaining = n_chunks; + remaining *= NETWORK_CHUNK_SIZE; + +// printf ("remaining="); dump_hex64(remaining); puts(""); + + remaining -= amount_read; + while (remaining > 0) { + amount_read = read (sock, chunk, NETWORK_CHUNK_SIZE); + remaining -= amount_read; + + if (amount_read < 0) { + perror ("read"); + break; + } else + if (!amount_read) + break; + } + + char *foo = "OK.\n\n"; + write (sock, foo, 4); + close (sock); + } + + return true; +} + +//---------------------------------------------------------------------------- +// Name: network_test +//---------------------------------------------------------------------------- +bool +network_test (char **destinations, int n_destinations) +{ + int i; + + //---------------------------------------- + // The memory chunk starts with a 12-byte + // length of the overall send size. + // The memory chunk will have a list of + // the destinations in it. 
+ // In future, there will be a mechanism + // for testing bandwidth between all nodes, + // not just the leader & each of the + // transponders. + // + char chunk [NETWORK_CHUNK_SIZE]; + memset (chunk, 0, NETWORK_CHUNK_SIZE); + sprintf (chunk, "000000000000\n%d\n", n_destinations); + for (i = 0; i < n_destinations; i++) { + char *s = destinations [i]; + int chunk_len = strlen (chunk); + int len = strlen (s); + if (len + chunk_len < NETWORK_CHUNK_SIZE-1) { + //---------------------------------------- + // "transp" indicates that the given node + // has not yet been a leader. + // In future, "done" will indicate it has. + // + sprintf (chunk + chunk_len, "%s %s\n", s, "transp"); + } + } + + //---------------------------------------- + // For each destination, run the test. + // + for (i = 0; i < n_destinations; i++) { + int j = 0; + bool problem = false; + + char *hostname = destinations[i]; + printf ("Bandwidth sending to %s:\n", hostname); + + //---------------------------------------- + // Send from 8kB up to 32 MB of data. + // + while (!problem && j < 13) { + unsigned long chunk_count = 1 << j; + unsigned long long amt_to_send = chunk_count; + amt_to_send *= NETWORK_CHUNK_SIZE; + + if (!amt_to_send) // unlikely + break; + + //---------------------------------------- + // Write the overall send size into the + // 1st line of the chunk so that the + // transponder knows how large the send + // is without guessing. + // + sprintf (chunk, "%11lu", chunk_count); + chunk[11] = ' '; + + //-------------------- + // Send the data. 
+ // + long duration = network_test_core (hostname, + chunk, NETWORK_CHUNK_SIZE, chunk_count); + if (duration == -1) { + problem = true; + fprintf (stderr, "\nCan't connect to %s\n", hostname); + } else { + unsigned long amt_in_kb = amt_to_send / 1024; + unsigned long amt_in_mb = amt_to_send / 1048576; + if (!amt_in_mb) { + printf ("\tSent %lu kB...", amt_in_kb); + } else { + printf ("\tSent %lu MB...", amt_in_mb); + } + + //------------------------------ + // Calculate rate in MB/sec. + // + // Get total # bytes. + unsigned long long tmp = NETWORK_CHUNK_SIZE; + tmp *= chunk_count; + + // Get total bytes per second. + tmp *= 1000000; + tmp /= duration; + + // Bytes to megabytes. + tmp /= 1000; + tmp /= 10; + unsigned long whole = tmp / 100; + unsigned long frac = tmp % 100; + printf ("%lu.%02lu MB/second\n", whole, frac); + } + j++; + } + + puts (""); + } + + return true; +} + +//---------------------------------------------------------------------------- +// Name: usage +//---------------------------------------------------------------------------- +void +usage () +{ + printf ("Usage for memory tests: bandwidth [--slow] [--title string]\n"); + printf ("Usage for starting network tests: bandwidth --network <ipaddr1> [<ipaddr2...]\n"); + printf ("Usage for receiving network tests: bandwidth --transponder\n"); + exit (0); +} + +//---------------------------------------------------------------------------- +// Name: main +//---------------------------------------------------------------------------- +int +main (int argc, char **argv) +{ + int i, chunk_size; + + --argc; + ++argv; + + strcpy (graph_title, TITLE); + + bool network_mode = false; + bool network_leader = false; // false => transponder + int network_destinations_size = 0; + int n_network_destinations = 0; + char **network_destinations = NULL; + + i = 0; + while (i < argc) { + char *s = argv [i++]; + if (!strcmp ("--network", s)) { + network_mode = true; + network_leader = true; + network_destinations_size = 
20; + network_destinations = (char**) malloc (network_destinations_size * sizeof (char*)); + } + else + if (!strcmp ("--transponder", s)) { + network_mode = true; + } + else + if (!strcmp ("--slow", s)) { + usec_per_test=20000000; // 20 seconds per test. + } + else + if (!strcmp ("--quick", s)) { + usec_per_test = 250000; // 0.25 seconds per test. + } + else + if (!strcmp ("--nosse2", s)) { + use_sse2 = false; + use_sse4 = false; + } + else + if (!strcmp ("--nosse4", s)) { + use_sse4 = false; + } + else + if (!strcmp ("--help", s)) { + usage (); + } + else + if (!strcmp ("--title", s) && i != argc) { + sprintf (graph_title, "%s -- %s", TITLE, argv[i++]); + } + else { + if (!network_mode || !network_leader) + usage (); + + if ('-' == *s) + usage (); + + if (n_network_destinations >= network_destinations_size) { + network_destinations_size *= 2; + int newsize = sizeof(char*) * network_destinations_size; + network_destinations = realloc (network_destinations, + newsize); + } + + network_destinations [n_network_destinations++] = strdup (s); + } + } + + msg[0] = 0; + +#if !(defined(__WIN32__) && (defined(__arm__) || defined(__aarch64__))) + printf ("This is bandwidth version %s.\n", VERSION); + printf ("Copyright (C) 2005-2010 by Zack T Smith.\n\n"); + printf ("This software is covered by the GNU Public License.\n"); + printf ("It is provided AS-IS, use at your own risk.\n"); + printf ("See the file COPYING for more information.\n\n"); + fflush (stdout); +#else + println (L"(C) 2010 by Zack Smith"); + println (L"Under GNU Public License"); + println (L"Use at your own risk."); +#endif + + //---------------------------------------- + // If network mode selected, enter it now. + // Currently cannot combine memory tests + // & network tests. 
+ // + if (network_mode) { + if (network_leader) { + network_test (network_destinations, n_network_destinations); + } else { + network_transponder (); + } + + puts ("Done."); + return 0; + } + +#if !defined(__arm__) && !defined(__aarch64__) + if (!has_sse2 ()) { + puts ("Processor does not have SSE2."); + use_sse2 = false; + use_sse4 = false; + } + +#ifdef __x86_64__ + if (use_sse2) + println (L"Using 128-bit and 64-bit data transfers."); + else + println (L"Using 64-bit data transfers."); +#else + if (use_sse2) + println (L"Using 128-bit and 32-bit data transfers."); + else + println (L"Using 32-bit data transfers."); +#endif + +#else + +#if defined(__aarch64__) + println (L"Using 64-bit transfers."); +#else + println (L"Using 32-bit transfers."); +#endif + + use_sse2 = false; +#endif + + println (L"Notation: kB = 1024 B, MB = 1048576 B."); + + flush (); + + //------------------------------------------------------------ + // Attempt to obtain information about the CPU. + // +#ifdef __linux__ + struct stat st; + if (!stat ("/proc/cpuinfo", &st)) { +#define TMPFILE "/tmp/bandw_tmp" + unlink (TMPFILE); + if (-1 == system ("grep MHz /proc/cpuinfo | uniq | sed \"s/[\\t\\n: a-zA-Z]//g\" > "TMPFILE)) + perror ("system"); + + FILE *f = fopen (TMPFILE, "r"); + if (f) { + float cpu_speed = 0.0; + + if (1 == fscanf (f, "%g", &cpu_speed)) { + puts (""); + printf ("CPU speed is %g MHz.\n", cpu_speed); + } + fclose (f); + } + +#if !defined(__arm__) && !defined(__aarch64__) + unlink (TMPFILE); + if (-1 == system ("grep -i sse4 /proc/cpuinfo > "TMPFILE)) + perror ("system"); + + if (!stat (TMPFILE, &st)) { + if (st.st_size < 2) { + use_sse4 = false; + puts ("Processor lacks SSE4."); + } + } + + if (!use_sse2) { + unlink (TMPFILE); + if (-1 == system ("grep -i sse2 /proc/cpuinfo > "TMPFILE)) + perror ("system"); + + if (!stat (TMPFILE, &st)) { + if (st.st_size < 2) { + use_sse2 = false; + puts ("Processor lacks SSE2."); + } + } + } +#endif + } else { + printf ("CPU information is 
not available (/proc/cpuinfo).\n"); + } + fflush (stdout); +#endif + + graph = BMP_new (graph_width, graph_height); + graph_init (); + +#if !defined(__arm__) && !defined(__aarch64__) + //------------------------------------------------------------ + // SSE2 sequential reads. + // + if (use_sse2) { + graph_new_line ("Sequential 128-bit reads", RGB_RED); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_read (chunk_size, true, false); + + graph_add_point (chunk_size, amount); + } + } + + //------------------------------------------------------------ + // SSE2 random reads. + // + if (use_sse2) { + graph_new_line ("Random 128-bit reads", RGB_MAROON); + + newline (); + srand (time (NULL)); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_read (chunk_size, true, true); + + graph_add_point (chunk_size, amount); + } + } + + //------------------------------------------------------------ + // SSE2 sequential writes that do not bypass the caches. + // + if (use_sse2) { + graph_new_line ("Sequential 128-bit cache writes", RGB_PURPLE); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_write (chunk_size, SSE2, false); + + graph_add_point (chunk_size, amount); + } + } + + //------------------------------------------------------------ + // SSE2 random writes that do not bypass the caches. + // + if (use_sse2) { + graph_new_line ("Random 128-bit cache writes", RGB_NAVYBLUE); + + newline (); + srand (time (NULL)); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_write (chunk_size, SSE2, true); + + graph_add_point (chunk_size, amount); + } + } + + //------------------------------------------------------------ + // SSE2 sequential writes that do bypass the caches. 
+ // + if (use_sse2) { + graph_new_line ("Sequential 128-bit bypassing writes", RGB_DARKORANGE); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_write (chunk_size, SSE2_BYPASS, false); + + graph_add_point (chunk_size, amount); + } + } + + //------------------------------------------------------------ + // SSE2 random writes that bypass the caches. + // + if (use_sse2) { + graph_new_line ("Random 128-bit bypassing writes", RGB_LEMONYELLOW); + + newline (); + srand (time (NULL)); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_write (chunk_size, SSE2_BYPASS, true); + + graph_add_point (chunk_size, amount); + } + } +#endif + + //------------------------------------------------------------ + // Sequential non-SSE2 reads. + // + newline (); +#if defined(__x86_64__) || defined(__aarch64__) + graph_new_line ("Sequential 64-bit reads", RGB_BLUE); +#else + graph_new_line ("Sequential 32-bit reads", RGB_BLUE); +#endif + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_read (chunk_size, false, false); + + graph_add_point (chunk_size, amount); + } + + //------------------------------------------------------------ + // Random non-SSE2 reads. + // + newline (); +#if defined(__x86_64__) || defined(__aarch64__) + graph_new_line ("Random 64-bit reads", RGB_CYAN); +#else + graph_new_line ("Random 32-bit reads", RGB_CYAN); +#endif + srand (time (NULL)); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_read (chunk_size, false, true); + + graph_add_point (chunk_size, amount); + } + + //------------------------------------------------------------ + // Sequential non-SSE2 writes. 
+ // +#if defined(__x86_64__) || defined(__aarch64__) + graph_new_line ("Sequential 64-bit writes", RGB_DARKGREEN); +#else + graph_new_line ("Sequential 32-bit writes", RGB_DARKGREEN); +#endif + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_write (chunk_size, NO_SSE2, false); + + graph_add_point (chunk_size, amount); + } + + //------------------------------------------------------------ + // Random non-SSE2 writes. + // +#if defined(__x86_64__) || defined(__aarch64__) + graph_new_line ("Random 64-bit writes", RGB_GREEN); +#else + graph_new_line ("Random 32-bit writes", RGB_GREEN); +#endif + + newline (); + srand (time (NULL)); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_write (chunk_size, NO_SSE2, true); + + graph_add_point (chunk_size, amount); + } + +#if !defined(__arm__) && !defined(__aarch64__) + //------------------------------------------------------------ + // SSE2 sequential copy. + // + if (use_sse2) { + graph_new_line ("Sequential 128-bit copy", 0x8f8844); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_copy (chunk_size, SSE2); + + graph_add_point (chunk_size, amount); + } + } +#endif + + //------------------------------------------------------------ + // Register to register. + // + newline (); + register_test (); + + //------------------------------------------------------------ + // Stack to/from register. + // + newline (); + stack_test (); + + //------------------------------------------------------------ + // C library performance. + // + newline (); + library_test (); + + //------------------------------------------------------------ + // Framebuffer read & write. 
+ // +#if defined(__linux__) && defined(FBIOGET_FSCREENINFO) + newline (); + fb_readwrite (true); +#endif + +#if defined(__WIN32__) && (defined(__arm__) || defined(__aarch64__)) + MessageBoxW (0, msg, APPNAME, 0); + + FILE *of = fopen ("bandwidth.log", "w"); + if (of) { + dump (of); + fclose (of); + } +#else + flush (); +#endif + + graph_make (); + + BMP_write (graph, "bandwidth.bmp"); + BMP_delete (graph); +#if defined(__linux__) || defined(__CYGWIN__) || defined(__APPLE__) + puts ("\nWrote graph to bandwidth.bmp."); + puts (""); + puts ("Done."); +#endif + + return 0; +}
diff --git a/main_thread.c b/main_thread.c new file mode 100644 index 0000000..99e6078 --- /dev/null +++ b/main_thread.c
@@ -0,0 +1,2379 @@ +/*============================================================================ + bandwidth 0.24, a benchmark to estimate memory transfer bandwidth. + Copyright (C) 2005-2010 by Zack T Smith. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + The author may be reached at fbui@comcast.net. + *===========================================================================*/ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <sys/param.h> +#include <sys/types.h> +#include <sys/time.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include <fcntl.h> +#include <unistd.h> +#include <wchar.h> +#include <math.h> +#include <pthread.h> + +#include <netdb.h> // gethostbyname +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> + +#include "defs.h" +#include "BMP.h" +#include "config.h" + +#ifdef __WIN32__ +#include <windows.h> +#endif + +#ifdef __linux__ +#include <linux/fb.h> +#include <sys/mman.h> +#endif + +#ifdef CONFIG_ARCH_S2L +#if defined(CONFIG_BSP_BOARD_S2LM_KIWI) || defined(CONFIG_BSP_BOARD_STRAWBERRY) +#define DRAM_SIZE_SMALL +#endif +#endif + +//---------------------------------------- +// Graphing data. 
+// +static char graph_title [500]; +#define TITLE "Results from bandwidth " VERSION " by Zack Smith, http://caladan.tk" +static BMP *graph; // Graph of results. +static int graph_width = 1280; +static int graph_height = 720; +static int graph_left_margin = 100; +static int graph_margin = 50; // top/bottom/right +static int graph_x_span = 1; +static int graph_y_span = 1; +static int graph_last_x = -1; +static int graph_last_y = -1; +static unsigned long graph_fg = RGB_BLACK; +static int legend_y; +#define MAX_GRAPH_DATA 5000 +static long graph_data [MAX_GRAPH_DATA]; +static int graph_data_index = 0; +enum { + DATUM_SIZE=0, + DATUM_AMOUNT=1, + DATUM_COLOR=2, +}; +static int max_bandwidth = 0; // Always 10 times the # of megabyte/sec. + +static bool use_sse2 = true; +static bool use_sse4 = true; + +static int goon_flag = 1; +static int thread_num = 4; +static int chunk_index = 0; +static int cpu_num = 0; + +struct thread_params { + int id; + unsigned long size; + bool random; + unsigned long **chunk_ptrs; + unsigned char *chunk; + unsigned long loops; +}; + +//---------------------------------------- +// Parameters for the tests. +// +static long usec_per_test = 5000000; // 5 seconds per test. + +static int chunk_sizes[] = { + 256, + 512, + 768, + 1024, + 2048, + 3072, + 4096, + 6144, + 8192, // Some processors' L1 data caches are only 8kB. + 12288, + 16384, + 20480, + 24576, + 28672, + 32768, // Common L1 data cache size. + 40960, + 49152, + 65536, + 131072, // Old L2 cache size. + 192 * 1024, + 256 * 1024, // Old L2 cache size. + 384 * 1024, + 512 * 1024, // Old L2 cache size. + 768 * 1024, + 1 << 20, // 1 MB = common L2 cache size. + (1024 + 256) * 1024, // 1.25 + (1024 + 512) * 1024, // 1.5 + (1024 + 768) * 1024, // 1.75 + 1 << 21, // 2 MB = common L2 cache size. + (2048 + 256) * 1024, // 2.25 + (2048 + 512) * 1024, // 2.5 + (2048 + 768) * 1024, // 2.75 + 3072 * 1024, // 3 MB = common L2 cache sized. 
+ 1 << 22, // 4 MB + 5242880, // 5 megs + 6291456, // 6 megs (std L2 cache size) + 16 * 1024 * 1024, + 64 * 1024 * 1024, + 0 +}; + +//---------------------------------------- +// Under CeGCC, the math.h log2() function +// turned out to be very inaccurate e.g. +// log2(8)=1.44, so I have here hard-coded +// the logarithms. +// +static double chunk_sizes_log2[] = +{ + 8, + 9, + 9.585, + 10, + 11, + 11.585, + 12, + 12.585, + 13, // 8 kB + 13.585, + 14, // 16 kB + 14.3219, // 20 kB + 14.585, // 24 kB + 14.8074, // 28 kB + 15, // 32 kB + 15.3219, // 40 kB + 15.585, // 48 kB + 16, // 64 kB + 17, // 128 kB + 17.585, // 192 kB + 18, // 256 kB + 18.585, // 385 kB + 19, // 512 kB + 19.585, // 768 kB + 20, // 1 MB + 20.3219, // 1.25 + 20.585, // 1.5 + 20.8074, // 1.75 + 21, // 2 MB + 21.1699, // 2.25 MB + 21.3219, // 2.5 MB + 21.4594, // 2.75 MB + 21.585, // 3 MB + 22, // 4 MB + 22.3219, + 22.585, + 24, + 26, + 0 +}; + +static int min_chunk_size = 1; // These are determined in graph_draw_labels(). +static int max_chunk_size = 1; + +//---------------------------------------------------------------------------- +// Name: error +// Purpose: Complain and exit. +//---------------------------------------------------------------------------- +void error (char *s) +{ +#ifndef __WIN32__ + fprintf (stderr, "Error: %s\n", s); + exit (1); +#else + wchar_t tmp [200]; + int i; + for (i = 0; s[i]; i++) + tmp[i] = s[i]; + tmp[i] = 0; + MessageBoxW (0, tmp, L"Error", 0); + ExitProcess (0); +#endif +} + +void +dump_hex64 (unsigned long long value) +{ + unsigned long long v2 = value; + int i = 16; + while (i--) { + unsigned long long tmp = v2 >> 60; + unsigned int tmp2 = (unsigned int) tmp; + printf ("%1x", tmp2); + v2 <<= 4; + } +} + +//============================================================================ +// Graphing logic. 
+//============================================================================ + +//---------------------------------------------------------------------------- +// Name: graph_draw_labels +// Purpose: Draw the labels and ticks. +//---------------------------------------------------------------------------- +void +graph_draw_labels () +{ + int i; + + //---------------------------------------- + // Horizontal + // + //-------------------- + // Establish min & max. + // + min_chunk_size = 1000; + max_chunk_size = 0; + i = 0; + int j; + while ((j = chunk_sizes_log2 [i])) { + if (j < min_chunk_size) + min_chunk_size = j; + if (j > max_chunk_size) + max_chunk_size = j; + i++; + } + + for (i = min_chunk_size; i <= max_chunk_size; i++) { + char str[20]; + int x = graph_left_margin + + ((i-min_chunk_size) * graph_x_span) / + (max_chunk_size - min_chunk_size); + int y = graph_height - graph_margin + 10; + + unsigned long amt = 1 << i; + if (amt < 1024) + sprintf (str, "%ld B", amt); + else if (amt < (1<<20)) { + sprintf (str, "%ld kB", amt >> 10); + } + else { + j = amt >> 20; + switch ((amt >> 18) & 3) { + case 0: sprintf (str, "%d MB", j); break; + case 1: sprintf (str, "%d.25 MB", j); break; + case 2: sprintf (str, "%d.5 MB", j); break; + case 3: sprintf (str, "%d.75 MB", j); break; + } + } + + BMP_vline (graph, x, y, y-10, RGB_BLACK); + BMP_draw_mini_string (graph, str, x - 10, y+8, RGB_BLACK); + } + + //---------------------------------------- + // Vertical + // + for (i = 0; i <= (max_bandwidth/10000); i++) { + char str[20]; + int x = graph_left_margin - 10; + int y = graph_height - graph_margin - + (i * graph_y_span) / (max_bandwidth/10000); + + BMP_hline (graph, x, x+10, y, RGB_BLACK); + + sprintf (str, "%d GB/s", i); + BMP_draw_mini_string (graph, str, + x - 40, y - MINIFONT_HEIGHT/2, RGB_BLACK); + } +} + +void +graph_init () +{ + if (!graph) + return; + + BMP_clear (graph, RGB_WHITE); + + BMP_hline (graph, graph_left_margin, graph_width - graph_margin, + 
graph_height - graph_margin, RGB_BLACK); + BMP_vline (graph, graph_left_margin, graph_margin, + graph_height - graph_margin, RGB_BLACK); + + graph_x_span = graph_width - (graph_margin + graph_left_margin); + graph_y_span = graph_height - 2 * graph_margin; + + BMP_draw_mini_string (graph, graph_title, + graph_left_margin, graph_margin/2, RGB_BLACK); + + legend_y = graph_margin; +} + +void +graph_new_line (char *str, unsigned long color) +{ + BMP_draw_mini_string (graph, str, + graph_width - graph_margin - 200, legend_y, color); + + legend_y += 10; + + graph_fg = color; + graph_last_x = graph_last_y = -1; + + if (graph_data_index >= MAX_GRAPH_DATA-2) + error ("Too many graph data."); + + graph_data [graph_data_index++] = DATUM_COLOR; + graph_data [graph_data_index++] = (long) color; +} + +//---------------------------------------------------------------------------- +// Name: graph_add_point +// Purpose: Adds a point to this list to be drawn. +//---------------------------------------------------------------------------- +void +graph_add_point (int size, int amount) +{ + if (graph_data_index >= MAX_GRAPH_DATA-4) + error ("Too many graph data."); + + graph_data [graph_data_index++] = DATUM_SIZE; + graph_data [graph_data_index++] = size; + graph_data [graph_data_index++] = DATUM_AMOUNT; + graph_data [graph_data_index++] = amount; +} + +//---------------------------------------------------------------------------- +// Name: graph_plot +// Purpose: Plots a point on the current graph. +//---------------------------------------------------------------------------- +void +graph_plot (int size, int amount) +{ + //---------------------------------------- + // Get the log2 of the chunk size. + // We cannot rely on the libm math.h log2 + // function, because under CeGCC, + // log2(8) = 1.44. 
+ // + int i = chunk_index; + while (chunk_sizes [i] && chunk_sizes [i] != size) + i++; + if (!chunk_sizes [i]) + error ("Lookup of chunk size failed."); + double tmp = chunk_sizes_log2 [i]; + + //---------------------------------------- + // Plot the point. The x axis is + // logarithmic, base 2. + // + tmp -= (double) min_chunk_size; + tmp *= (double) graph_x_span; + tmp /= (double) (max_chunk_size - min_chunk_size); + + int x = graph_left_margin + (int) tmp; + int y = graph_height - graph_margin - + (amount * graph_y_span) / max_bandwidth; + +// Really I ought to save all data points, take max of everything, then plot. + + if (graph_last_x != -1 && graph_last_y != -1) { + BMP_line (graph, graph_last_x, graph_last_y, x, y, graph_fg); + } + + graph_last_x = x; + graph_last_y = y; +} + +//---------------------------------------------------------------------------- +// Name: graph_make +// Purpose: Plots all lines. +//---------------------------------------------------------------------------- +void +graph_make () +{ + int i; + + //---------------------------------------- + // Get the maximum bandwidth in order to + // properly scale the graph vertically. + // + max_bandwidth = 0; + for (i = 0; i < graph_data_index; i += 2) { + if (graph_data[i] == DATUM_AMOUNT) { + int amt = graph_data[i+1]; + if (amt > max_bandwidth) + max_bandwidth = amt; + } + } + max_bandwidth /= 10000; + max_bandwidth *= 10000; + max_bandwidth += 10000; + + graph_draw_labels (); + + //---------------------------------------- + // OK, now draw the lines. 
+ // + int size = -1, amt = -1; + for (i = 0; i < graph_data_index; i += 2) + { + int type = graph_data[i]; + long value = graph_data[i+1]; + + switch (type) { + case DATUM_AMOUNT: amt = value; break; + case DATUM_SIZE: size = value; break; + case DATUM_COLOR: + graph_fg = (unsigned long) value; + graph_last_x = -1; + graph_last_y = -1; + break; + } + + if (amt != -1 && size != -1) { + graph_plot (size, amt); + amt = size = -1; + } + } +} + +//============================================================================ +// Output buffer logic. +//============================================================================ + +#define MSGLEN 10000 +static wchar_t msg [MSGLEN]; + +void print (wchar_t *s) +{ + wcscat (msg, s); +} + +void newline () +{ + wcscat (msg, L"\n"); +} + +void println (wchar_t *s) +{ + wcscat (msg, s); + newline (); +} + +void print_int (int d) +{ +#if defined(__WIN32__) && (defined(__arm__) || defined(__aarch64__)) + swprintf (msg + wcslen (msg), L"%d", d); +#else + swprintf (msg + wcslen (msg), MSGLEN, L"%d", d); +#endif +} + +void println_int (int d) +{ + print_int (d); + newline (); +} + +void print_result (long double result) +{ +#if defined(__WIN32__) && (defined(__arm__) || defined(__aarch64__)) + swprintf (msg + wcslen (msg), L"%.1Lf MB/s", result); +#else + swprintf (msg + wcslen (msg), MSGLEN, L"%.1Lf MB/s", result); +#endif +} + +void dump (FILE *f) +{ + if (!f) + f = stdout; + + int i = 0; + while (msg[i]) { + char ch = (char) msg[i]; + fputc (ch, f); + i++; + } + + msg [0] = 0; +} + +void flush () +{ +#if defined(__WIN32__) && (defined(__arm__) || defined(__aarch64__)) + MessageBeep (MB_OK); +#else + dump (NULL); + fflush (stdout); +#endif +} + +void print_size (unsigned long size) +{ + if (size < 1024) { + print_int (size); + print (L" B"); + } + else if (size < (1<<20)) { + print_int (size >> 10); + print (L" kB"); + } else { + print_int (size >> 20); + switch ((size >> 18) & 3) { + case 1: print (L".25"); break; + case 2: print 
(L".5"); break; + case 3: print (L".75"); break; + } + print (L" MB"); + } +} + +//============================================================================ +// Timing logic. +//============================================================================ + +//---------------------------------------------------------------------------- +// Name: mytime +// Purpose: Reports time in microseconds. +//---------------------------------------------------------------------------- +unsigned long mytime () +{ +#ifndef __WIN32__ + struct timeval tv; + struct timezone tz; + memset (&tz, 0, sizeof(struct timezone)); + gettimeofday (&tv, &tz); + return 1000000 * tv.tv_sec + tv.tv_usec; +#else + return 1000 * GetTickCount (); // accurate enough. +#endif +} + +//---------------------------------------------------------------------------- +// Name: calculate_result +// Purpose: Calculates and prints a result. +// Returns: 10 times the number of megabytes per second. +//---------------------------------------------------------------------------- +int +calculate_result (unsigned long chunk_size, long long total_count, long diff) +{ + if (!diff) + error ("Zero time difference."); + +// printf ("\nIn calculate_result, chunk_size=%ld, total_count=%lld, diff=%ld\n", chunk_size, total_count, diff); + long double result = (long double) chunk_size; + result *= (long double) total_count; + result *= 1000000.; + result /= 1048576.; + result /= (long double) diff; + + print_result (result); + + return (long) (10.0 * result); +} + +//============================================================================ +// Tests. +//============================================================================ + +//---------------------------------------------------------------------------- +// Name: do_write +// Purpose: Performs write on chunk of memory of specified size. 
+//---------------------------------------------------------------------------- +enum { + NO_SSE2, + SSE2, + SSE2_BYPASS, +}; + +static void *do_thread_write(void *arg) +{ + struct thread_params *params = (struct thread_params *)arg; + unsigned long total_count = 0; +#if defined(__x86_64__) || defined(__aarch64__) + unsigned long value = 0x1234567689abcdef; +#else + unsigned long value = 0x12345678; +#endif + cpu_set_t mask; + + CPU_ZERO(&mask); + CPU_SET(params->id % cpu_num, &mask); + + if (pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask) < 0) + fprintf(stderr, "set thread %d affinity failed\n", params->id); + + while(goon_flag) { + total_count += params->loops; + + if (params->random) + RandomWriter (params->chunk_ptrs, params->size/256, params->loops, value); + else + Writer (params->chunk, params->size, params->loops, value); + } + + params->loops = total_count; + + pthread_exit(NULL); + + return NULL; +} + + +int +do_write (unsigned long size, int mode, bool random) +{ + unsigned char *chunk; + unsigned char *chunk0; + unsigned long loops; + unsigned long long total_count=0; + unsigned long diff=0, t0; + unsigned long tmp; + unsigned long **chunk_ptrs = NULL; + struct thread_params *params; + pthread_t *tid; + int i, rval; + + if (size & 255) + error ("do_write(): chunk size is not multiple of 256."); + + params = malloc(sizeof(struct thread_params) * thread_num); + if (!params) + error ("Out of memory"); + + tid = malloc(sizeof(pthread_t) * thread_num); + if (!tid) + error ("Out of memory"); + + //------------------------------------------------- + chunk0 = malloc (size+32); + chunk = chunk0; + if (!chunk) + error ("Out of memory"); + + tmp = (unsigned long) chunk; + if (tmp & 15) { + tmp -= (tmp & 15); + tmp += 16; + chunk = (unsigned char*) tmp; + } + + //---------------------------------------- + // Set up random pointers to chunks. 
+ // + if (random) { + tmp = size/256; + chunk_ptrs = (unsigned long**) malloc (sizeof (unsigned long*) * tmp); + if (!chunk_ptrs) + error ("Out of memory."); + + //---------------------------------------- + // Store pointers to all chunks into array. + // + int i; + for (i = 0; i < tmp; i++) { + chunk_ptrs [i] = (unsigned long*) (chunk + 256 * i); + } + + //---------------------------------------- + // Randomize the array of chunk pointers. + // + int k = 100; + while (k--) { + for (i = 0; i < tmp; i++) { + int j = rand() % tmp; + if (i != j) { + unsigned long *ptr = chunk_ptrs [i]; + chunk_ptrs [i] = chunk_ptrs [j]; + chunk_ptrs [j] = ptr; + } + } + } + } + + //------------------------------------------------- + if (random) + print (L"Random write "); + else + print (L"Sequential write "); + + if (mode == SSE2) { + print (L"(128-bit), size = "); + } + else + if (mode == SSE2_BYPASS) { + print (L"bypassing cache (128-bit), size = "); + } else { +#if defined(__x86_64__) || defined(__aarch64__) + print (L"(64-bit), size = "); +#else + print (L"(32-bit), size = "); +#endif + } + + print_size (size); + print (L", "); + + loops = (1 << 26) / size;// XX need to adjust for CPU MHz + + tmp = size / thread_num; + + for (i = 0; i < thread_num; i++) { + params[i].id = i; + params[i].random = random; + params[i].size = tmp < 1024 ? size : tmp; + if (random) + params[i].chunk_ptrs = tmp < 1024 ? chunk_ptrs : chunk_ptrs + i * (tmp / 256); + else + params[i].chunk = tmp < 1024 ? 
chunk : chunk + i * tmp; + params[i].loops = loops; + } + + t0 = mytime (); + + goon_flag = 1; + + for (i = 0; i < thread_num; i++) { + rval = pthread_create(&tid[i] ,NULL, do_thread_write, ¶ms[i]); + if (rval < 0) { + perror("can't create pthread\n"); + return rval; + } + } + + usleep(usec_per_test); + + goon_flag = 0; + + for (i = 0; i < thread_num; i++) { + pthread_join(tid[i], NULL); + total_count += params[i].loops; + } + + diff = mytime () - t0; + + total_count /= thread_num; + + print (L"loops = "); + print_int (total_count); + print (L", "); + + flush (); + + int result = calculate_result (size, total_count, diff); + newline (); + + flush (); + + free ((void*)chunk0); + + if (chunk_ptrs) + free (chunk_ptrs); + + return result; +} + +static void *do_thread_read(void *arg) +{ + struct thread_params *params = (struct thread_params *)arg; + unsigned long total_count = 0; + cpu_set_t mask; + + CPU_ZERO(&mask); + CPU_SET(params->id % cpu_num, &mask); + + if (pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask) < 0) + fprintf(stderr, "set thread %d affinity failed\n", params->id); + + while(goon_flag) { + total_count += params->loops; + + if (params->random) + RandomReader (params->chunk_ptrs, params->size/256, params->loops); + else + Reader (params->chunk, params->size, params->loops); + } + + params->loops = total_count; + + pthread_exit(NULL); + + return NULL; +} + +//---------------------------------------------------------------------------- +// Name: do_read +// Purpose: Performs sequential read on chunk of memory of specified size. 
+//---------------------------------------------------------------------------- +int +do_read (unsigned long size, bool use_sse2, bool random) +{ + unsigned long diff=0; + unsigned long long total_count = 0; + unsigned char *chunk; + unsigned char *chunk0; + unsigned long tmp; + unsigned long **chunk_ptrs = NULL; + unsigned long t0, loops = (1 << 26) / size; // XX need to adjust for CPU MHz + struct thread_params *params; + pthread_t *tid; + int i, rval; + + if (size & 255) + error ("do_read(): chunk size is not multiple of 256."); + + params = malloc(sizeof(struct thread_params) * thread_num); + if (!params) + error ("Out of memory"); + + tid = malloc(sizeof(pthread_t) * thread_num); + if (!tid) + error ("Out of memory"); + + //------------------------------------------------- + if (random) + print (L"Random read "); + else + print (L"Sequential read "); + + if (use_sse2) { + print (L"(128-bit), size = "); + } else { +#if defined(__x86_64__) || defined(__aarch64__) + print (L"(64-bit), size = "); +#else + print (L"(32-bit), size = "); +#endif + } + + print_size (size); + print (L", "); + + flush (); + + //------------------------------------------------- + chunk0 = chunk = malloc (size+32); + if (!chunk) + error ("Out of memory"); + + memset (chunk, 0, size); + + tmp = (unsigned long) chunk; + if (tmp & 15) { + tmp -= (tmp & 15); + tmp += 16; + chunk = (unsigned char*) tmp; + } + + //---------------------------------------- + // Set up random pointers to chunks. + // + if (random) { + int tmp = size/256; + chunk_ptrs = (unsigned long**) malloc (sizeof (unsigned long*) * tmp); + if (!chunk_ptrs) + error ("Out of memory."); + + //---------------------------------------- + // Store pointers to all chunks into array. + // + int i; + for (i = 0; i < tmp; i++) { + chunk_ptrs [i] = (unsigned long*) (chunk + 256 * i); + } + + //---------------------------------------- + // Randomize the array of chunk pointers. 
+ // + int k = 100; + while (k--) { + for (i = 0; i < tmp; i++) { + int j = rand() % tmp; + if (i != j) { + unsigned long *ptr = chunk_ptrs [i]; + chunk_ptrs [i] = chunk_ptrs [j]; + chunk_ptrs [j] = ptr; + } + } + } + } + + tmp = size / thread_num; + + for (i = 0; i < thread_num; i++) { + params[i].id = i; + params[i].random = random; + params[i].size = tmp < 1024 ? size : tmp; + if (random) + params[i].chunk_ptrs = tmp < 1024 ? chunk_ptrs : chunk_ptrs + i * (tmp / 256); + else + params[i].chunk = tmp < 1024 ? chunk : chunk + i * tmp; + params[i].loops = loops; + } + + t0 = mytime (); + + goon_flag = 1; + + for (i = 0; i < thread_num; i++) { + rval = pthread_create(&tid[i] ,NULL, do_thread_read, ¶ms[i]); + if (rval < 0) { + perror("can't create pthread\n"); + return rval; + } + } + + usleep(usec_per_test); + + goon_flag = 0; + + for (i = 0; i < thread_num; i++) { + pthread_join(tid[i], NULL); + total_count += params[i].loops; + } + + diff = mytime () - t0; + + total_count /= thread_num; + + print (L"loops = "); + print_int (total_count); + print (L", "); + + int result = calculate_result (size, total_count, diff); + newline (); + + flush (); + + free (chunk0); + + if (chunk_ptrs) + free (chunk_ptrs); + + free(params); + free(tid); + + return result; +} + + + +//---------------------------------------------------------------------------- +// Name: do_copy +// Purpose: Performs sequential memory copy. 
+//---------------------------------------------------------------------------- +int +do_copy (unsigned long size, int mode) +{ + unsigned long loops; + unsigned long long total_count = 0; + unsigned long t0, diff=0; + unsigned char *chunk_src; + unsigned char *chunk_dest; + unsigned char *chunk_src0; + unsigned char *chunk_dest0; + unsigned long tmp; + + if (size & 255) + error ("do_copy(): chunk size is not multiple of 256."); + + //------------------------------------------------- + chunk_src0 = chunk_src = malloc (size+32); + if (!chunk_src) + error ("Out of memory"); + chunk_dest0 = chunk_dest = malloc (size+32); + if (!chunk_dest) + error ("Out of memory"); + + memset (chunk_src, 100, size); + memset (chunk_dest, 200, size); + + tmp = (unsigned long) chunk_src; + if (tmp & 15) { + tmp -= (tmp & 15); + tmp += 16; + chunk_src = (unsigned char*) tmp; + } + tmp = (unsigned long) chunk_dest; + if (tmp & 15) { + tmp -= (tmp & 15); + tmp += 16; + chunk_dest = (unsigned char*) tmp; + } + + //------------------------------------------------- + print (L"Sequential copy "); + + if (mode == SSE2) { + print (L"(128-bit), size = "); + } + else { +#if defined(__x86_64__) || defined(__aarch64__) + print (L"(64-bit), size = "); +#else + print (L"(32-bit), size = "); +#endif + } + + print_size (size); + print (L", "); + + flush (); + + loops = (1 << 26) / size; // XX need to adjust for CPU MHz + + t0 = mytime (); + + while (diff < usec_per_test) { + total_count += loops; + +#if !defined(__arm__) && !defined(__aarch64__) + if (mode == SSE2) + CopySSE (chunk_dest, chunk_src, size, loops); +#if 0 + else + Copy (chunk_dest, chunk_src, size, loops); +#endif +#endif + + diff = mytime () - t0; + } + + print (L"loops = "); + print_int (total_count); + print (L", "); + + int result = calculate_result (size, total_count, diff); + newline (); + + flush (); + + free (chunk_src0); + free (chunk_dest0); + + return result; +} + + 
//----------------------------------------------------------------------------
// Name:	fb_readwrite
// Purpose:	Performs sequential read & write tests on framebuffer memory.
// Params:	use_sse2 = on non-ARM builds, selects the SSE2 read routine
//		           and the cache-bypassing SSE2 write routine.
// Notes:	Opens /dev/fb0 (or /dev/fb/0), maps it, runs one timed read
//		pass and one timed write pass over FB_SIZE bytes, then unmaps
//		and closes the device.  Any failure is reported through the
//		output buffer and ends the test early.
//----------------------------------------------------------------------------
#if defined(__linux__) && defined(FBIOGET_FSCREENINFO)
void
fb_readwrite (bool use_sse2)
{
	unsigned long total_count;
	unsigned long length;
	unsigned long fblen;
	unsigned long diff, t0;
	static struct fb_fix_screeninfo fi;
	static struct fb_var_screeninfo vi;
	unsigned long *fb = NULL;
	int fd;
#if defined(__x86_64__) || defined(__aarch64__)
	unsigned long value = 0x1234567689abcdef;
#else
	unsigned long value = 0x12345678;
#endif

	//-------------------------------------------------

	fd = open ("/dev/fb0", O_RDWR);
	if (fd < 0)
		fd = open ("/dev/fb/0", O_RDWR);
	if (fd < 0) {
		println (L"Cannot open framebuffer device.");
		return;
	}

	if (ioctl (fd, FBIOGET_FSCREENINFO, &fi)) {
		close (fd);
		println (L"Cannot get framebuffer info");
		return;
	}

	if (ioctl (fd, FBIOGET_VSCREENINFO, &vi)) {
		close (fd);
		println (L"Cannot get framebuffer info");
		return;
	}

	if (fi.visual != FB_VISUAL_TRUECOLOR &&
	    fi.visual != FB_VISUAL_DIRECTCOLOR ) {
		close (fd);
		println (L"Need direct/truecolor framebuffer device.");
		return;
	}

	print (L"Framebuffer resolution: ");
	print_int (vi.xres);
	print (L"x");
	print_int (vi.yres);
	print (L", ");
	print_int (vi.bits_per_pixel);
	println (L" bpp\n");

	fblen = fi.smem_len;

	// Let the kernel choose the address; smem_start is not a usable
	// address hint for this process.
	fb = mmap (NULL, fblen,
		PROT_WRITE | PROT_READ,
		MAP_SHARED, fd, 0);
	if (fb == MAP_FAILED) {
		close (fd);
		println (L"Cannot access framebuffer memory.");
		return;
	}

	//-------------------
	// Use only the upper half of the display.
	//
	length = FB_SIZE;

	// Never read or write beyond the mapped region.
	if (length > fblen) {
		munmap (fb, fblen);
		close (fd);
		println (L"Framebuffer is smaller than the test region.");
		return;
	}

	//-------------------
	// READ
	//
	print (L"Framebuffer memory sequential read ");
	flush ();

	t0 = mytime ();

	total_count = FBLOOPS_R;

#if !defined(__arm__) && !defined(__aarch64__)
	if (use_sse2)
		ReaderSSE2 (fb, length, FBLOOPS_R);
	else
#endif
		Reader (fb, length, FBLOOPS_R);

	diff = mytime () - t0;

	calculate_result (length, total_count, diff);
	newline ();

	//-------------------
	// WRITE
	//
	print (L"Framebuffer memory sequential write ");
	flush ();

	t0 = mytime ();

	total_count = FBLOOPS_W;

#if !defined(__arm__) && !defined(__aarch64__)
	if (use_sse2)
		WriterSSE2_bypass (fb, length, FBLOOPS_W, value);
	else
#endif
		Writer (fb, length, FBLOOPS_W, value);

	diff = mytime () - t0;

	calculate_result (length, total_count, diff);
	newline ();

	// Release the mapping and the device.
	munmap (fb, fblen);
	close (fd);
}
#endif

//----------------------------------------------------------------------------
// Name:	register_test
// Purpose:	Determines bandwidth of register-to-register transfers.
+//---------------------------------------------------------------------------- +void +register_test () +{ + long long total_count = 0; + unsigned long t0; + unsigned long diff = 0; + + //-------------------------------------- +#if defined(__x86_64__) || defined(__aarch64__) + print (L"Main register to main register transfers (64-bit) "); +#else + print (L"Main register to main register transfers (32-bit) "); +#endif + flush (); +#define REGISTER_COUNT 10000 + + t0 = mytime (); + while (diff < usec_per_test) + { + RegisterToRegister (REGISTER_COUNT); + total_count += REGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + +#if !defined(__arm__) && !defined(__aarch64__) + //-------------------------------------- +#ifdef __x86_64__ + print (L"Main register to vector register transfers (64-bit) "); +#else + print (L"Main register to vector register transfers (32-bit) "); +#endif + flush (); +#define VREGISTER_COUNT 3333 + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + RegisterToVector (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + + //-------------------------------------- +#ifdef __x86_64__ + print (L"Vector register to main register transfers (64-bit) "); +#else + print (L"Vector register to main register transfers (32-bit) "); +#endif + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + VectorToRegister (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + + //-------------------------------------- + print (L"Vector register to vector register transfers (128-bit) "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + VectorToVector (VREGISTER_COUNT); + 
total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + + //-------------------------------------- + if (use_sse4) { + print (L"Vector 8-bit datum to main register transfers "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Vector8ToRegister (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + } + + //-------------------------------------- + print (L"Vector 16-bit datum to main register transfers "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Vector16ToRegister (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + + //-------------------------------------- + if (use_sse4) { + print (L"Vector 32-bit datum to main register transfers "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Vector32ToRegister (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + } + +#ifdef __x86_64__ + //-------------------------------------- + print (L"Vector 64-bit datum to main register transfers "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Vector64ToRegister (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); +#endif + + //-------------------------------------- + if (use_sse4) { + print (L"Main register 8-bit datum to vector register transfers "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Register8ToVector 
(VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + } + + //-------------------------------------- + print (L"Main register 16-bit datum to vector register transfers "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Register16ToVector (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + + //-------------------------------------- + if (use_sse4) { + print (L"Main register 32-bit datum to vector register transfers "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Register32ToVector (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + } + +#ifdef __x86_64__ + //-------------------------------------- + print (L"Main register 64-bit datum to vector register transfers "); + flush (); + + t0 = mytime (); + diff = 0; + total_count = 0; + while (diff < usec_per_test) + { + Register64ToVector (VREGISTER_COUNT); + total_count += VREGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); +#endif +#endif +} + +//---------------------------------------------------------------------------- +// Name: stack_test +// Purpose: Determines bandwidth of stack-to/from-register transfers. 
+//---------------------------------------------------------------------------- +void +stack_test () +{ + long long total_count = 0; + unsigned long t0; + unsigned long diff = 0; + +#if defined(__x86_64__) || defined(__aarch64__) + print (L"Stack-to-register transfers (64-bit) "); +#else + print (L"Stack-to-register transfers (32-bit) "); +#endif + flush (); + + //-------------------------------------- + diff = 0; + total_count = 0; + t0 = mytime (); + while (diff < usec_per_test) + { + StackReader (REGISTER_COUNT); + total_count += REGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); + +#if defined(__x86_64__) || defined(__aarch64__) + print (L"Register-to-stack transfers (64-bit) "); +#else + print (L"Register-to-stack transfers (32-bit) "); +#endif + flush (); + + //-------------------------------------- + diff = 0; + total_count = 0; + t0 = mytime (); + while (diff < usec_per_test) + { + StackWriter (REGISTER_COUNT); + total_count += REGISTER_COUNT; + + diff = mytime () - t0; + } + + calculate_result (256, total_count, diff); + newline (); + flush (); +} + +//---------------------------------------------------------------------------- +// Name: library_test +// Purpose: Performs C library tests (memset, memcpy). 
//----------------------------------------------------------------------------
void
library_test ()
{
	char *fill_buf, *copy_buf;
	unsigned long start, finish;
	int pass;

	// NT_SIZE is the size in bytes of each buffer; NT_SIZE2 is how many
	// times each library call is repeated. Both depend on the platform.
#if defined(__WIN32__) && (defined(__arm__) || defined(__aarch64__))
	#define NT_SIZE (1024*1024)
	#define NT_SIZE2 (50)
#elif !defined(__WIN32__) && (defined(__arm__) || defined(__aarch64__))
#if defined(DRAM_SIZE_SMALL)
	#define NT_SIZE (16*1024*1024)
#else
	#define NT_SIZE (32*1024*1024)
#endif
	#define NT_SIZE2 (50)
#else
	#define NT_SIZE (64*1024*1024)
	#define NT_SIZE2 (100)
#endif

	fill_buf = malloc (NT_SIZE);
	if (fill_buf == NULL)
		error ("Out of memory");

	copy_buf = malloc (NT_SIZE);
	if (copy_buf == NULL)
		error ("Out of memory");

	//--------------------------------------
	// memset: fill the first buffer NT_SIZE2 times, using the pass
	// number as the fill value.
	start = mytime ();
	pass = 0;
	while (pass < NT_SIZE2) {
		memset (fill_buf, pass, NT_SIZE);
		pass++;
	}
	finish = mytime ();

	print (L"Library: memset ");
	calculate_result (NT_SIZE, NT_SIZE2, finish - start);
	newline ();

	flush ();

	//--------------------------------------
	// memcpy: copy the first buffer into the second NT_SIZE2 times.
	start = mytime ();
	pass = 0;
	while (pass < NT_SIZE2) {
		memcpy (copy_buf, fill_buf, NT_SIZE);
		pass++;
	}
	finish = mytime ();

	print (L"Library: memcpy ");
	calculate_result (NT_SIZE, NT_SIZE2, finish - start);
	newline ();

	flush ();

	free (fill_buf);
	free (copy_buf);
}

//----------------------------------------------------------------------------
// Name:	network_test_core
// Purpose:	Performs the network test, talking to and receiving data
//		back from a transponder node.
// Note:	Port number specified using server:# notation.
// Returns:	-1 on error, else the network duration in microseconds.
+//---------------------------------------------------------------------------- +long +network_test_core (const char *net_path, char *chunk, + unsigned long chunk_size, + unsigned long count) +{ + char hostname [PATH_MAX]; + char *s; + int port = NETWORK_DEFAULT_PORTNUM ; + strcpy (hostname, net_path); + if ((s = strchr (hostname, ':'))) { + *s++ = 0; + port = atoi (s); + } + + struct hostent* host = gethostbyname (hostname); + if (!host) + return -1; + + char *host_ip = inet_ntoa (*(struct in_addr *)*host->h_addr_list); + int sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + + struct sockaddr_in addr; + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = inet_addr(host_ip); + addr.sin_port = htons(port); + + if (connect (sock, (struct sockaddr*) &addr, sizeof (struct sockaddr))) + { + // perror ("connect"); + close (sock); + return -1; + } + + //------------------------------------ + // Send all of our data. + // + unsigned long t0 = mytime (); + int i; + for (i = 0; i < count; i++) + send (sock, chunk, chunk_size, 0); + +#if 0 + //------------------------------------ + // Set nonblocking mode. + // + int opt = 1; + ioctl (sock, FIONBIO, &opt); +#endif + + //------------------------------------ + // Read the response. 
+ // + char *buffer = malloc (chunk_size); + if (!buffer) { + close (sock); + // perror ("malloc"); + return -1; + } + int amount = recv (sock, buffer, chunk_size, 0); + if (amount <= 0) { + close (sock); + //perror ("recv"); + return -1; + } + + long t = mytime () - t0; + close (sock); + free (buffer); + return t; +} + +//---------------------------------------------------------------------------- +// Name: ip_to_str +//---------------------------------------------------------------------------- +void +ip_to_str (unsigned long addr, char *str) +{ + if (!str) + return; + + unsigned short a = 0xff & addr; + unsigned short b = 0xff & (addr >> 8); + unsigned short c = 0xff & (addr >> 16); + unsigned short d = 0xff & (addr >> 24); + sprintf (str, "%u.%u.%u.%u", a,b,c,d); +} + +//---------------------------------------------------------------------------- +// Name: network_transponder +// Purpose: Act as a transponder, receiving chunks of data and sending +// back an acknowledgement once the enture chunk is read. +// Returns: False if a problem occurs setting up the network socket. +//---------------------------------------------------------------------------- +bool +network_transponder () +{ + struct sockaddr_in sin, from; + + //------------------------------ + // Get listening socket for port. + // Then listen on given port#. + // + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(NETWORK_DEFAULT_PORTNUM); + int listensock; + if ((listensock = socket (AF_INET, SOCK_STREAM, 0)) < 0) { + perror ("socket"); + return false; + } + if (bind (listensock, (struct sockaddr*) &sin, sizeof(sin)) < 0) { + perror ("bind"); + close (listensock); + return false; + } + if (listen (listensock, 500) < 0) { + perror ("listen"); + close (listensock); + return false; + } + + bool done = false; + while (!done) { + //---------------------------------------- + // Wait for a client to contact us. 
+ // + socklen_t len = sizeof (struct sockaddr); + int sock = accept (listensock, (struct sockaddr*) &from, &len); + if (sock < 0) { + perror ("accept"); + close (listensock); + return false; + } + + if (len != sizeof (struct sockaddr_in)) { + close (sock); + close (listensock); + return false; + } + +#if 0 + unsigned long ipaddr = from.sin_addr.s_addr; + char ipstring[30]; + ip_to_str (ipaddr, ipstring); + fprintf (stderr, "Incoming connection from %s\n", ipstring); +#endif + + char chunk [NETWORK_CHUNK_SIZE+1]; + long n_chunks = 0; + int amount_read = read (sock, chunk, NETWORK_CHUNK_SIZE); + chunk [amount_read] = 0; + if (1 != sscanf (chunk, "%ld", &n_chunks)) { + close (sock); + close (listensock); + return false; + } + + //---------------------------------------- + // If the leader sends us a chunk count of + // -99, this indicates that we should exit. + // + if (n_chunks == -99) { + close (sock); + close (listensock); + return true; + } + +// printf ("Reading %lu chunks of %d bytes...\n", n_chunks, NETWORK_CHUNK_SIZE); + + unsigned long long remaining = n_chunks; + remaining *= NETWORK_CHUNK_SIZE; + +// printf ("remaining="); dump_hex64(remaining); puts(""); + + remaining -= amount_read; + while (remaining > 0) { + amount_read = read (sock, chunk, NETWORK_CHUNK_SIZE); + remaining -= amount_read; + + if (amount_read < 0) { + perror ("read"); + break; + } else + if (!amount_read) + break; + } + + char *foo = "OK.\n\n"; + write (sock, foo, 4); + close (sock); + } + + return true; +} + +//---------------------------------------------------------------------------- +// Name: network_test +//---------------------------------------------------------------------------- +bool +network_test (char **destinations, int n_destinations) +{ + int i; + + //---------------------------------------- + // The memory chunk starts with a 12-byte + // length of the overall send size. + // The memory chunk will have a list of + // the destinations in it. 
+ // In future, there will be a mechanism + // for testing bandwidth between all nodes, + // not just the leader & each of the + // transponders. + // + char chunk [NETWORK_CHUNK_SIZE]; + memset (chunk, 0, NETWORK_CHUNK_SIZE); + sprintf (chunk, "000000000000\n%d\n", n_destinations); + for (i = 0; i < n_destinations; i++) { + char *s = destinations [i]; + int chunk_len = strlen (chunk); + int len = strlen (s); + if (len + chunk_len < NETWORK_CHUNK_SIZE-1) { + //---------------------------------------- + // "transp" indicates that the given node + // has not yet been a leader. + // In future, "done" will indicate it has. + // + sprintf (chunk + chunk_len, "%s %s\n", s, "transp"); + } + } + + //---------------------------------------- + // For each destination, run the test. + // + for (i = 0; i < n_destinations; i++) { + int j = 0; + bool problem = false; + + char *hostname = destinations[i]; + printf ("Bandwidth sending to %s:\n", hostname); + + //---------------------------------------- + // Send from 8kB up to 32 MB of data. + // + while (!problem && j < 13) { + unsigned long chunk_count = 1 << j; + unsigned long long amt_to_send = chunk_count; + amt_to_send *= NETWORK_CHUNK_SIZE; + + if (!amt_to_send) // unlikely + break; + + //---------------------------------------- + // Write the overall send size into the + // 1st line of the chunk so that the + // transponder knows how large the send + // is without guessing. + // + sprintf (chunk, "%11lu", chunk_count); + chunk[11] = ' '; + + //-------------------- + // Send the data. 
+ // + long duration = network_test_core (hostname, + chunk, NETWORK_CHUNK_SIZE, chunk_count); + if (duration == -1) { + problem = true; + fprintf (stderr, "\nCan't connect to %s\n", hostname); + } else { + unsigned long amt_in_kb = amt_to_send / 1024; + unsigned long amt_in_mb = amt_to_send / 1048576; + if (!amt_in_mb) { + printf ("\tSent %lu kB...", amt_in_kb); + } else { + printf ("\tSent %lu MB...", amt_in_mb); + } + + //------------------------------ + // Calculate rate in MB/sec. + // + // Get total # bytes. + unsigned long long tmp = NETWORK_CHUNK_SIZE; + tmp *= chunk_count; + + // Get total bytes per second. + tmp *= 1000000; + tmp /= duration; + + // Bytes to megabytes. + tmp /= 1000; + tmp /= 10; + unsigned long whole = tmp / 100; + unsigned long frac = tmp % 100; + printf ("%lu.%02lu MB/second\n", whole, frac); + } + j++; + } + + puts (""); + } + + return true; +} + +//---------------------------------------------------------------------------- +// Name: usage +//---------------------------------------------------------------------------- +void +usage () +{ + printf ("Usage for memory tests: bandwidth [--quick] [--thread N] [--chunk-size N]\n"); + printf ("Usage for starting network tests: bandwidth --network <ipaddr1> [<ipaddr2...]\n"); + printf ("Usage for receiving network tests: bandwidth --transponder\n"); + exit (0); +} + +//---------------------------------------------------------------------------- +// Name: main +//---------------------------------------------------------------------------- +int +main (int argc, char **argv) +{ + int i, j, chunk_size; + + --argc; + ++argv; + + strcpy (graph_title, TITLE); + + bool network_mode = false; + bool network_leader = false; // false => transponder + int network_destinations_size = 0; + int n_network_destinations = 0; + char **network_destinations = NULL; + + i = 0; + while (i < argc) { + char *s = argv [i++]; + if (!strcmp ("--network", s)) { + network_mode = true; + network_leader = true; + 
network_destinations_size = 20; + network_destinations = (char**) malloc (network_destinations_size * sizeof (char*)); + } + else + if (!strcmp ("--transponder", s)) { + network_mode = true; + } + else + if (!strcmp ("--slow", s)) { + usec_per_test=20000000; // 20 seconds per test. + } + else + if (!strcmp ("--quick", s)) { + usec_per_test = 250000; // 0.25 seconds per test. + } + else + if (!strcmp ("--nosse2", s)) { + use_sse2 = false; + use_sse4 = false; + } + else + if (!strcmp ("--nosse4", s)) { + use_sse4 = false; + } + else + if (!strcmp ("--help", s)) { + usage (); + } + else + if (!strcmp ("--title", s) && i != argc) { + sprintf (graph_title, "%s -- %s", TITLE, argv[i++]); + } + else + if (!strcmp ("--thread", s)) { + int n = 0; + thread_num = atoi(argv[i++]); + for (j = 0; j < 32; j++) + n += (thread_num >> j) & 0x1; + if (n > 1) + error("thread_num must be power of 2\n"); + } + else + if (!strcmp ("--chunk-size", s)) { + chunk_size = strtoul(argv[i++], NULL, 0); + for (j = 0; j < sizeof(chunk_sizes) / sizeof(chunk_sizes[0]); j++) { + if (chunk_size <= chunk_sizes[j]) + break; + } + chunk_index = j; + } + else { + if (!network_mode || !network_leader) + usage (); + + if ('-' == *s) + usage (); + + if (n_network_destinations >= network_destinations_size) { + network_destinations_size *= 2; + int newsize = sizeof(char*) * network_destinations_size; + network_destinations = realloc (network_destinations, + newsize); + } + + network_destinations [n_network_destinations++] = strdup (s); + } + } + + msg[0] = 0; + +#if !(defined(__WIN32__) && (defined(__arm__) || defined(__aarch64__))) + printf ("This is bandwidth version %s.\n", VERSION); + printf ("Copyright (C) 2005-2010 by Zack T Smith.\n\n"); + printf ("This software is covered by the GNU Public License.\n"); + printf ("It is provided AS-IS, use at your own risk.\n"); + printf ("See the file COPYING for more information.\n\n"); + fflush (stdout); +#else + println (L"(C) 2010 by Zack Smith"); + println 
(L"Under GNU Public License"); + println (L"Use at your own risk."); +#endif + + //---------------------------------------- + // If network mode selected, enter it now. + // Currently cannot combine memory tests + // & network tests. + // + if (network_mode) { + if (network_leader) { + network_test (network_destinations, n_network_destinations); + } else { + network_transponder (); + } + + puts ("Done."); + return 0; + } + +#if !defined(__arm__) && !defined(__aarch64__) + if (!has_sse2 ()) { + puts ("Processor does not have SSE2."); + use_sse2 = false; + use_sse4 = false; + } + +#ifdef __x86_64__ + if (use_sse2) + println (L"Using 128-bit and 64-bit data transfers."); + else + println (L"Using 64-bit data transfers."); +#else + if (use_sse2) + println (L"Using 128-bit and 32-bit data transfers."); + else + println (L"Using 32-bit data transfers."); +#endif + +#else + +#if defined(__aarch64__) + println (L"Using 64-bit transfers."); +#else + println (L"Using 32-bit transfers."); +#endif + + use_sse2 = false; +#endif + + println (L"Notation: kB = 1024 B, MB = 1048576 B."); + + flush (); + + //------------------------------------------------------------ + // Attempt to obtain information about the CPU. 
+ // +#ifdef __linux__ + struct stat st; + if (!stat ("/proc/cpuinfo", &st)) { +#define TMPFILE "/tmp/bandw_tmp" + unlink (TMPFILE); + if (-1 == system ("grep MHz /proc/cpuinfo | uniq | sed \"s/[\\t\\n: a-zA-Z]//g\" > "TMPFILE)) + perror ("system"); + + FILE *f = fopen (TMPFILE, "r"); + if (f) { + float cpu_speed = 0.0; + + if (1 == fscanf (f, "%g", &cpu_speed)) { + puts (""); + printf ("CPU speed is %g MHz.\n", cpu_speed); + } + fclose (f); + } + +#if !defined(__arm__) && !defined(__aarch64__) + unlink (TMPFILE); + if (-1 == system ("grep -i sse4 /proc/cpuinfo > "TMPFILE)) + perror ("system"); + + if (!stat (TMPFILE, &st)) { + if (st.st_size < 2) { + use_sse4 = false; + puts ("Processor lacks SSE4."); + } + } + + if (!use_sse2) { + unlink (TMPFILE); + if (-1 == system ("grep -i sse2 /proc/cpuinfo > "TMPFILE)) + perror ("system"); + + if (!stat (TMPFILE, &st)) { + if (st.st_size < 2) { + use_sse2 = false; + puts ("Processor lacks SSE2."); + } + } + } +#endif + } else { + printf ("CPU information is not available (/proc/cpuinfo).\n"); + } + + cpu_num = sysconf(_SC_NPROCESSORS_CONF); + printf("System has %d processor(s)\n", cpu_num); + + fflush (stdout); +#endif + + graph = BMP_new (graph_width, graph_height); + graph_init (); + +#if !defined(__arm__) && !defined(__aarch64__) + //------------------------------------------------------------ + // SSE2 sequential reads. + // + if (use_sse2) { + graph_new_line ("Sequential 128-bit reads", RGB_RED); + + newline (); + + i = chunk_index; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_read (chunk_size, true, false); + + graph_add_point (chunk_size, amount); + } + } + + //------------------------------------------------------------ + // SSE2 random reads. 
+ // + if (use_sse2) { + graph_new_line ("Random 128-bit reads", RGB_MAROON); + + newline (); + srand (time (NULL)); + + i = chunk_index; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_read (chunk_size, true, true); + + graph_add_point (chunk_size, amount); + } + } + + //------------------------------------------------------------ + // SSE2 sequential writes that do not bypass the caches. + // + if (use_sse2) { + graph_new_line ("Sequential 128-bit cache writes", RGB_PURPLE); + + newline (); + + i = chunk_index; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_write (chunk_size, SSE2, false); + + graph_add_point (chunk_size, amount); + } + } + + //------------------------------------------------------------ + // SSE2 random writes that do not bypass the caches. + // + if (use_sse2) { + graph_new_line ("Random 128-bit cache writes", RGB_NAVYBLUE); + + newline (); + srand (time (NULL)); + + i = chunk_index; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_write (chunk_size, SSE2, true); + + graph_add_point (chunk_size, amount); + } + } + + //------------------------------------------------------------ + // SSE2 sequential writes that do bypass the caches. + // + if (use_sse2) { + graph_new_line ("Sequential 128-bit bypassing writes", RGB_DARKORANGE); + + newline (); + + i = chunk_index; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_write (chunk_size, SSE2_BYPASS, false); + + graph_add_point (chunk_size, amount); + } + } + + //------------------------------------------------------------ + // SSE2 random writes that bypass the caches. 
+ // + if (use_sse2) { + graph_new_line ("Random 128-bit bypassing writes", RGB_LEMONYELLOW); + + newline (); + srand (time (NULL)); + + i = chunk_index; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_write (chunk_size, SSE2_BYPASS, true); + + graph_add_point (chunk_size, amount); + } + } +#endif + + //------------------------------------------------------------ + // Sequential non-SSE2 reads. + // + newline (); +#if defined(__x86_64__) || defined(__aarch64__) + graph_new_line ("Sequential 64-bit reads", RGB_BLUE); +#else + graph_new_line ("Sequential 32-bit reads", RGB_BLUE); +#endif + + i = chunk_index; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_read (chunk_size, false, false); + + graph_add_point (chunk_size, amount); + } + + //------------------------------------------------------------ + // Random non-SSE2 reads. + // + newline (); +#if defined(__x86_64__) || defined(__aarch64__) + graph_new_line ("Random 64-bit reads", RGB_CYAN); +#else + graph_new_line ("Random 32-bit reads", RGB_CYAN); +#endif + srand (time (NULL)); + + i = chunk_index; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_read (chunk_size, false, true); + + graph_add_point (chunk_size, amount); + } + + //------------------------------------------------------------ + // Sequential non-SSE2 writes. + // +#if defined(__x86_64__) || defined(__aarch64__) + graph_new_line ("Sequential 64-bit writes", RGB_DARKGREEN); +#else + graph_new_line ("Sequential 32-bit writes", RGB_DARKGREEN); +#endif + + newline (); + + i = chunk_index; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_write (chunk_size, NO_SSE2, false); + + graph_add_point (chunk_size, amount); + } + + //------------------------------------------------------------ + // Random non-SSE2 writes. 
+ // +#if defined(__x86_64__) || defined(__aarch64__) + graph_new_line ("Random 64-bit writes", RGB_GREEN); +#else + graph_new_line ("Random 32-bit writes", RGB_GREEN); +#endif + + newline (); + srand (time (NULL)); + + i = chunk_index; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_write (chunk_size, NO_SSE2, true); + + graph_add_point (chunk_size, amount); + } + +#if !defined(__arm__) && !defined(__aarch64__) + //------------------------------------------------------------ + // SSE2 sequential copy. + // + if (use_sse2) { + graph_new_line ("Sequential 128-bit copy", 0x8f8844); + + newline (); + + i = 0; + while ((chunk_size = chunk_sizes [i++])) { + int amount = do_copy (chunk_size, SSE2); + + graph_add_point (chunk_size, amount); + } + } +#endif + + //------------------------------------------------------------ + // Register to register. + // + newline (); + register_test (); + + //------------------------------------------------------------ + // Stack to/from register. + // + newline (); + stack_test (); + + //------------------------------------------------------------ + // C library performance. + // + newline (); + library_test (); + + //------------------------------------------------------------ + // Framebuffer read & write. + // +#if defined(__linux__) && defined(FBIOGET_FSCREENINFO) + newline (); + fb_readwrite (true); +#endif + +#if defined(__WIN32__) && (defined(__arm__) || defined(__aarch64__)) + MessageBoxW (0, msg, APPNAME, 0); + + FILE *of = fopen ("bandwidth.log", "w"); + if (of) { + dump (of); + fclose (of); + } +#else + flush (); +#endif + + graph_make (); + + BMP_write (graph, "bandwidth.bmp"); + BMP_delete (graph); +#if defined(__linux__) || defined(__CYGWIN__) || defined(__APPLE__) + puts ("\nWrote graph to bandwidth.bmp."); + puts (""); + puts ("Done."); +#endif + + return 0; +}
diff --git a/make.inc b/make.inc new file mode 100644 index 0000000..e8eccab --- /dev/null +++ b/make.inc
@@ -0,0 +1,65 @@ +## +## unit_test/linux/benchmark/bandwidth/make.inc +## +## History: +## 2012/05/31 - [Cao Rongrong] Created file +## +## Copyright (C) 2011-2015, Ambarella, Inc. +## +## All rights reserved. No Part of this file may be reproduced, stored +## in a retrieval system, or transmitted, in any form, or by any means, +## electronic, mechanical, photocopying, recording, or otherwise, +## without the prior consent of Ambarella, Inc. +## +ifeq ($(BUILD_AMBARELLA_UNIT_TESTS_BANDWIDTH), y) + +LOCAL_PATH := $(call my-dir) + +### +include $(CLEAR_VARS) + +LOCAL_TARGET := bandwidth-arm +ifeq ($(CPU_ARCH), arm64) +LOCAL_SRCS := $(LOCAL_PATH)/routinesARM64.S +else +LOCAL_SRCS := $(LOCAL_PATH)/routinesARM.S +endif +LOCAL_SRCS += $(LOCAL_PATH)/main.c $(LOCAL_PATH)/BMP.c +LOCAL_CFLAGS := -O3 + +include $(BUILD_APP) + +.PHONY: $(LOCAL_TARGET) + +$(LOCAL_TARGET): $(LOCAL_MODULE) + @mkdir -p $(UNIT_TEST_PATH)/ + @cp -dpRf $< $(UNIT_TEST_PATH)/ + @echo "Build $@ Done." + +$(call add-target-into-build, $(LOCAL_TARGET)) + +### +include $(CLEAR_VARS) + +LOCAL_TARGET := bandwidth-arm-thread +ifeq ($(CPU_ARCH), arm64) +LOCAL_SRCS := $(LOCAL_PATH)/routinesARM64.S +else +LOCAL_SRCS := $(LOCAL_PATH)/routinesARM.S +endif +LOCAL_SRCS += $(LOCAL_PATH)/main_thread.c $(LOCAL_PATH)/BMP.c +LOCAL_CFLAGS := -O3 +LOCAL_LDFLAGS := -lpthread + +include $(BUILD_APP) + +.PHONY: $(LOCAL_TARGET) + +$(LOCAL_TARGET): $(LOCAL_MODULE) + @mkdir -p $(UNIT_TEST_PATH)/ + @cp -dpRf $< $(UNIT_TEST_PATH)/ + @echo "Build $@ Done." + +$(call add-target-into-build, $(LOCAL_TARGET)) + +endif
diff --git a/routines32.asm b/routines32.asm new file mode 100644 index 0000000..2f6f485 --- /dev/null +++ b/routines32.asm
@@ -0,0 +1,1636 @@ + +; ============================================================================ +; bandwidth 0.23, a benchmark to estimate memory transfer bandwidth. +; Copyright (C) 2005-2010 by Zack T Smith. +; +; This program is free software; you can redistribute it and/or modify +; it under the terms of the GNU General Public License as published by +; the Free Software Foundation; either version 2 of the License, or +; (at your option) any later version. +; +; This program is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; GNU General Public License for more details. +; +; You should have received a copy of the GNU General Public License +; along with this program; if not, write to the Free Software +; Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +; +; The author may be reached at fbui@comcast.net. +; ============================================================================= + +bits 32 +cpu prescott + +; Cygwin requires the underbar-prefixed symbols. 
+global _WriterSSE2 +global WriterSSE2 + +global _ReaderSSE2 +global ReaderSSE2 + +global _RandomReaderSSE2 +global RandomReaderSSE2 + +global _WriterSSE2_bypass +global WriterSSE2_bypass + +global _RandomWriterSSE2_bypass +global RandomWriterSSE2_bypass + +global Reader +global _Reader + +global Writer +global _Writer + +global RandomReader +global _RandomReader + +global RandomWriter +global _RandomWriter + +global RandomWriterSSE2 +global _RandomWriterSSE2 + +global has_sse2 +global _has_sse2 + +global CopySSE +global _CopySSE + +global RegisterToRegister +global _RegisterToRegister + +global VectorToVector +global _VectorToVector + +global RegisterToVector +global _RegisterToVector + +global VectorToRegister +global _VectorToRegister + +global Register8ToVector +global Register16ToVector +global Register32ToVector +global Register64ToVector +global Vector8ToRegister +global Vector16ToRegister +global Vector32ToRegister +global Vector64ToRegister + +global _Register8ToVector +global _Register16ToVector +global _Register32ToVector +global _Register64ToVector +global _Vector8ToRegister +global _Vector16ToRegister +global _Vector32ToRegister +global _Vector64ToRegister + +global StackReader +global _StackReader + +global StackWriter +global _StackWriter + + section .text + +;------------------------------------------------------------------------------ +; Name: Reader +; Purpose: Reads 32-bit values sequentially from an area of memory. +; Params: +; [esp+4] = ptr to memory area +; [esp+8] = length in bytes +; [esp+12] = loops +;------------------------------------------------------------------------------ +Reader: +_Reader: + push ebx + push ecx + push edx + + mov ecx, [esp+12+12] ; loops to do. + + mov edx, [esp+4+12] ; ptr to memory chunk. 
+ mov ebx, edx ; ebx = limit in memory + add ebx, [esp+8+12] + +.L1: + mov edx, [esp+4+12] + +.L2: + mov eax, [edx] + mov eax, [4+edx] + mov eax, [8+edx] + mov eax, [12+edx] + mov eax, [16+edx] + mov eax, [20+edx] + mov eax, [24+edx] + mov eax, [28+edx] + mov eax, [32+edx] + mov eax, [36+edx] + mov eax, [40+edx] + mov eax, [44+edx] + mov eax, [48+edx] + mov eax, [52+edx] + mov eax, [56+edx] + mov eax, [60+edx] + mov eax, [64+edx] + mov eax, [68+edx] + mov eax, [72+edx] + mov eax, [76+edx] + mov eax, [80+edx] + mov eax, [84+edx] + mov eax, [88+edx] + mov eax, [92+edx] + mov eax, [96+edx] + mov eax, [100+edx] + mov eax, [104+edx] + mov eax, [108+edx] + mov eax, [112+edx] + mov eax, [116+edx] + mov eax, [120+edx] + mov eax, [124+edx] + + mov eax, [edx+128] + mov eax, [edx+132] + mov eax, [edx+136] + mov eax, [edx+140] + mov eax, [edx+144] + mov eax, [edx+148] + mov eax, [edx+152] + mov eax, [edx+156] + mov eax, [edx+160] + mov eax, [edx+164] + mov eax, [edx+168] + mov eax, [edx+172] + mov eax, [edx+176] + mov eax, [edx+180] + mov eax, [edx+184] + mov eax, [edx+188] + mov eax, [edx+192] + mov eax, [edx+196] + mov eax, [edx+200] + mov eax, [edx+204] + mov eax, [edx+208] + mov eax, [edx+212] + mov eax, [edx+216] + mov eax, [edx+220] + mov eax, [edx+224] + mov eax, [edx+228] + mov eax, [edx+232] + mov eax, [edx+236] + mov eax, [edx+240] + mov eax, [edx+244] + mov eax, [edx+248] + mov eax, [edx+252] + + add edx, 256 + cmp edx, ebx + jb .L2 + + sub ecx, 1 + jnz .L1 + + pop edx + pop ecx + pop ebx + ret + + +;------------------------------------------------------------------------------ +; Name: Writer +; Purpose: Writes 32-bit value sequentially to an area of memory. 
+; Params:
+;	[esp+4] = ptr to memory area
+;	[esp+8] = length in bytes (each pass stores a whole 256-byte chunk)
+;	[esp+12] = loops
+;	[esp+16] = long to write
+;------------------------------------------------------------------------------
+Writer:
+_Writer:
+	push	ebx
+	push	ecx
+	push	edx
+
+	mov	ecx, [esp+12+12]	; ecx = loops (+12 skips the 3 saved registers)
+	mov	eax, [esp+16+12]	; eax = value to store
+
+	mov	edx, [esp+4+12]		; edx = ptr to chunk
+	mov	ebx, edx
+	add	ebx, [esp+8+12]		; ebx = limit in memory
+
+.L1:
+	mov	edx, [esp+4+12]		; restart at the beginning of the area
+
+.L2:
+	mov	[edx], eax		; 64 stores of 4 bytes = 256 bytes per pass
+	mov	[4+edx], eax
+	mov	[8+edx], eax
+	mov	[12+edx], eax
+	mov	[16+edx], eax
+	mov	[20+edx], eax
+	mov	[24+edx], eax
+	mov	[28+edx], eax
+	mov	[32+edx], eax
+	mov	[36+edx], eax
+	mov	[40+edx], eax
+	mov	[44+edx], eax
+	mov	[48+edx], eax
+	mov	[52+edx], eax
+	mov	[56+edx], eax
+	mov	[60+edx], eax
+	mov	[64+edx], eax
+	mov	[68+edx], eax
+	mov	[72+edx], eax
+	mov	[76+edx], eax
+	mov	[80+edx], eax
+	mov	[84+edx], eax
+	mov	[88+edx], eax
+	mov	[92+edx], eax
+	mov	[96+edx], eax
+	mov	[100+edx], eax
+	mov	[104+edx], eax
+	mov	[108+edx], eax
+	mov	[112+edx], eax
+	mov	[116+edx], eax
+	mov	[120+edx], eax
+	mov	[124+edx], eax
+
+	mov	[edx+128], eax
+	mov	[edx+132], eax
+	mov	[edx+136], eax
+	mov	[edx+140], eax
+	mov	[edx+144], eax
+	mov	[edx+148], eax
+	mov	[edx+152], eax
+	mov	[edx+156], eax
+	mov	[edx+160], eax
+	mov	[edx+164], eax
+	mov	[edx+168], eax
+	mov	[edx+172], eax
+	mov	[edx+176], eax
+	mov	[edx+180], eax
+	mov	[edx+184], eax
+	mov	[edx+188], eax
+	mov	[edx+192], eax
+	mov	[edx+196], eax
+	mov	[edx+200], eax
+	mov	[edx+204], eax
+	mov	[edx+208], eax
+	mov	[edx+212], eax
+	mov	[edx+216], eax
+	mov	[edx+220], eax
+	mov	[edx+224], eax
+	mov	[edx+228], eax
+	mov	[edx+232], eax
+	mov	[edx+236], eax
+	mov	[edx+240], eax
+	mov	[edx+244], eax
+	mov	[edx+248], eax
+	mov	[edx+252], eax
+
+	add	edx, 256
+	cmp	edx, ebx
+	jb	.L2			; unsigned compare against the end address
+
+	sub	ecx, 1
+	jnz	.L1
+
+	pop	edx
+	pop	ecx
+	pop	ebx
+	ret
+
+
+;------------------------------------------------------------------------------
+; Name:		has_sse2
+; Purpose:	Returns eax = 1 if CPUID leaf 1 reports SSE2 (EDX bit 26), else 0.
+has_sse2:
+_has_sse2:
+	push	ebx			; cpuid overwrites eax, ebx, ecx and edx
+	push	ecx
+	push	edx
+	mov	eax, 1			; CPUID leaf 1 = feature information
+	cpuid
+	xor	eax, eax		; clear the result before setnz fills in al
+	test	edx, 0x4000000		; bit 26 of EDX = SSE2
+	setnz	al
+	pop	edx
+	pop	ecx
+	pop	ebx
+	ret
+
+
+;------------------------------------------------------------------------------
+; Name:		ReaderSSE2
+; Purpose:	Reads 128-bit values sequentially from an area of memory.
+; Params:	[esp+4] = ptr to memory area (16-byte aligned, movdqa is used)
+; 		[esp+8] = length in bytes
+; 		[esp+12] = loops
+;------------------------------------------------------------------------------
+ReaderSSE2:
+_ReaderSSE2:
+	push	ebx
+	push	ecx
+
+	mov	ecx, [esp+12+8]		; ecx = loops (+8 skips the 2 saved registers)
+
+	mov	eax, [esp+4+8]
+	mov	ebx, eax
+	add	ebx, [esp+8+8]		; ebx points to end.
+
+.L1:
+	mov	eax, [esp+4+8]		; restart at the beginning of the area
+
+.L2:
+	movdqa	xmm0, [eax]		; Read aligned @ 16-byte boundary.
+	movdqa	xmm0, [16+eax]
+	movdqa	xmm0, [32+eax]
+	movdqa	xmm0, [48+eax]
+	movdqa	xmm0, [64+eax]
+	movdqa	xmm0, [80+eax]
+	movdqa	xmm0, [96+eax]
+	movdqa	xmm0, [112+eax]
+
+	movdqa	xmm0, [128+eax]
+	movdqa	xmm0, [144+eax]
+	movdqa	xmm0, [160+eax]
+	movdqa	xmm0, [176+eax]
+	movdqa	xmm0, [192+eax]
+	movdqa	xmm0, [208+eax]
+	movdqa	xmm0, [224+eax]
+	movdqa	xmm0, [240+eax]
+
+	add	eax, 256
+	cmp	eax, ebx
+	jb	.L2
+
+	sub	ecx, 1
+	jnz	.L1
+
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		WriterSSE2
+; Purpose:	Writes 128-bit values sequentially to an area of memory.
+; Params:	[esp+4] = ptr to memory area (16-byte aligned, movdqa is used)
+; 		[esp+8] = length in bytes
+; 		[esp+12] = loops
+; 		[esp+16] = 32-bit value, replicated into all four lanes of xmm0
+;------------------------------------------------------------------------------
+WriterSSE2:
+_WriterSSE2:
+	push	ebx
+	push	ecx
+
+	mov	eax, [esp+16+8]
+	movd	xmm0, eax		; Create a 128-bit replication of the 32-bit
+	movd	xmm1, eax		; value that was provided.
+	movd	xmm2, eax
+	movd	xmm3, eax
+	pslldq	xmm1, 4			; pslldq counts BYTES; a count above 15 zeroes
+	pslldq	xmm2, 8			; the register, so the lanes sit at byte
+	pslldq	xmm3, 12		; offsets 4, 8 and 12 (not 32, 64 and 96).
+	por	xmm0, xmm1
+	por	xmm0, xmm2
+	por	xmm0, xmm3
+
+	mov	ecx, [esp+12+8]		; ecx = loops
+
+	mov	eax, [esp+4+8]
+	mov	ebx, eax
+	add	ebx, [esp+8+8]		; ebx points to end.
+
+.L1:
+	mov	eax, [esp+4+8]		; restart at the beginning of the area
+
+.L2:
+	movdqa	[eax], xmm0
+	movdqa	[16+eax], xmm0
+	movdqa	[32+eax], xmm0
+	movdqa	[48+eax], xmm0
+	movdqa	[64+eax], xmm0
+	movdqa	[80+eax], xmm0
+	movdqa	[96+eax], xmm0
+	movdqa	[112+eax], xmm0
+
+	movdqa	[128+eax], xmm0
+	movdqa	[144+eax], xmm0
+	movdqa	[160+eax], xmm0
+	movdqa	[176+eax], xmm0
+	movdqa	[192+eax], xmm0
+	movdqa	[208+eax], xmm0
+	movdqa	[224+eax], xmm0
+	movdqa	[240+eax], xmm0
+
+	add	eax, 256
+	cmp	eax, ebx
+	jb	.L2
+
+	sub	ecx, 1
+	jnz	.L1
+
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		WriterSSE2_bypass
+; Purpose:	Writes 128-bit values sequentially to an area of memory,
+;		bypassing the cache (non-temporal movntdq stores).
+; Params:	[esp+4] = ptr to memory area (16-byte aligned)
+; 		[esp+8] = length in bytes
+; 		[esp+12] = loops
+; 		[esp+16] = 32-bit value, replicated into all four lanes of xmm0
+;------------------------------------------------------------------------------
+WriterSSE2_bypass:
+_WriterSSE2_bypass:
+	push	ebx
+	push	ecx
+
+	mov	eax, [esp+16+8]
+	movd	xmm0, eax		; Create a 128-bit replication of the 32-bit
+	movd	xmm1, eax		; value that was provided.
+	movd	xmm2, eax
+	movd	xmm3, eax
+	pslldq	xmm1, 4			; pslldq counts BYTES; a count above 15 zeroes
+	pslldq	xmm2, 8			; the register, so the lanes sit at byte
+	pslldq	xmm3, 12		; offsets 4, 8 and 12 (not 32, 64 and 96).
+	por	xmm0, xmm1
+	por	xmm0, xmm2
+	por	xmm0, xmm3
+
+	mov	ecx, [esp+12+8]		; ecx = loops
+
+	mov	eax, [esp+4+8]
+	mov	ebx, eax
+	add	ebx, [esp+8+8]		; ebx points to end.
+
+.L1:
+	mov	eax, [esp+4+8]		; restart at the beginning of the area
+
+.L2:
+	movntdq	[eax], xmm0		; Write bypassing cache.
+	movntdq	[16+eax], xmm0
+	movntdq	[32+eax], xmm0
+	movntdq	[48+eax], xmm0
+	movntdq	[64+eax], xmm0
+	movntdq	[80+eax], xmm0
+	movntdq	[96+eax], xmm0
+	movntdq	[112+eax], xmm0
+
+	movntdq	[128+eax], xmm0
+	movntdq	[144+eax], xmm0
+	movntdq	[160+eax], xmm0
+	movntdq	[176+eax], xmm0
+	movntdq	[192+eax], xmm0
+	movntdq	[208+eax], xmm0
+	movntdq	[224+eax], xmm0
+	movntdq	[240+eax], xmm0
+
+	add	eax, 256
+	cmp	eax, ebx
+	jb	.L2
+
+	sub	ecx, 1
+	jnz	.L1
+
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		RandomReader
+; Purpose:	Reads 32-bit values randomly from an area of memory.
+; Params:
+;	[esp+4] = ptr to array of chunk pointers
+;	[esp+8] = # of 256-byte chunks (offsets 0..252 of each chunk are read)
+;	[esp+12] = loops
+;------------------------------------------------------------------------------
+RandomReader:
+_RandomReader:
+	push	ebx
+	push	ecx
+	push	edx
+
+	mov	ecx, [esp+12+12]	; loops to do.
+
+.L0:
+	mov	ebx, [esp+8+12]		; # chunks to do
+
+.L1:
+	sub	ebx, 1			; walk the pointer array from the last entry
+	jc	.L2			; borrow = index went below zero, pass finished
+
+	mov	edx, [esp+4+12]		; get ptr to memory chunk.
+	mov	edx, [edx + 4*ebx]
+
+	mov	eax, [edx+160]
+	mov	eax, [edx+232]
+	mov	eax, [edx+224]
+	mov	eax, [96+edx]
+	mov	eax, [edx+164]
+	mov	eax, [76+edx]
+	mov	eax, [100+edx]
+	mov	eax, [edx+220]
+	mov	eax, [edx+248]
+	mov	eax, [104+edx]
+	mov	eax, [4+edx]
+	mov	eax, [edx+136]
+	mov	eax, [112+edx]
+	mov	eax, [edx+200]
+	mov	eax, [12+edx]
+	mov	eax, [edx+128]
+	mov	eax, [edx+148]
+	mov	eax, [edx+196]
+	mov	eax, [edx+216]
+	mov	eax, [edx]
+	mov	eax, [84+edx]
+	mov	eax, [edx+140]
+	mov	eax, [edx+204]
+	mov	eax, [edx+184]
+	mov	eax, [124+edx]
+	mov	eax, [48+edx]
+	mov	eax, [64+edx]
+	mov	eax, [edx+212]
+	mov	eax, [edx+240]
+	mov	eax, [edx+236]
+	mov	eax, [24+edx]
+	mov	eax, [edx+252]
+	mov	eax, [68+edx]
+	mov	eax, [20+edx]
+	mov	eax, [72+edx]
+	mov	eax, [32+edx]
+	mov	eax, [28+edx]
+	mov	eax, [52+edx]
+	mov	eax, [edx+244]
+	mov	eax, [edx+180]
+	mov	eax, [80+edx]
+	mov	eax, [60+edx]
+	mov	eax, [8+edx]
+	mov	eax, [56+edx]
+	mov	eax, [edx+208]
+	mov	eax, [edx+228]
+	mov	eax, [40+edx]
+	mov	eax, [edx+172]
+	mov	eax, [120+edx]
+	mov	eax, [edx+176]
+	mov	eax, [108+edx]
+	mov	eax, [edx+132]
+	mov	eax, [16+edx]
+	mov	eax, [44+edx]
+	mov	eax, [92+edx]
+	mov	eax, [edx+168]
+	mov	eax, [edx+152]
+	mov	eax, [edx+156]
+	mov	eax, [edx+188]
+	mov	eax, [36+edx]
+	mov	eax, [88+edx]
+	mov	eax, [116+edx]
+	mov	eax, [edx+192]
+	mov	eax, [edx+144]
+
+	jmp	.L1
+
+.L2:
+	sub	ecx, 1
+	jnz	.L0
+
+	pop	edx
+	pop	ecx
+	pop	ebx
+	ret
+
+
+;------------------------------------------------------------------------------
+; Name:		RandomReaderSSE2
+; Purpose:	Reads 128-bit values randomly from an area of memory.
+; Params:
+;	[esp+4] = ptr to array of chunk pointers (chunks 16-byte aligned)
+;	[esp+8] = # of 256-byte chunks (offsets 0..240 of each chunk are read)
+;	[esp+12] = loops
+;------------------------------------------------------------------------------
+RandomReaderSSE2:
+_RandomReaderSSE2:
+	push	ebx
+	push	ecx
+	push	edx
+
+	mov	ecx, [esp+12+12]	; loops to do.
+
+.L0:
+	mov	ebx, [esp+8+12]		; # chunks to do
+
+.L1:
+	sub	ebx, 1			; walk the pointer array from the last entry
+	jc	.L2			; borrow = index went below zero, pass finished
+
+	mov	edx, [esp+4+12]		; get ptr to memory chunk.
+	mov	edx, [edx + 4*ebx]
+
+; Read aligned @ 16-byte boundary.
+	movdqa	xmm0, [240+edx]
+	movdqa	xmm0, [128+edx]
+	movdqa	xmm0, [64+edx]
+	movdqa	xmm0, [208+edx]
+	movdqa	xmm0, [112+edx]
+	movdqa	xmm0, [176+edx]
+	movdqa	xmm0, [144+edx]
+	movdqa	xmm0, [edx]
+	movdqa	xmm0, [96+edx]
+	movdqa	xmm0, [16+edx]
+	movdqa	xmm0, [192+edx]
+	movdqa	xmm0, [160+edx]
+	movdqa	xmm0, [32+edx]
+	movdqa	xmm0, [48+edx]
+	movdqa	xmm0, [224+edx]
+	movdqa	xmm0, [80+edx]
+
+	jmp	.L1
+
+.L2:
+	sub	ecx, 1
+	jnz	.L0
+
+	pop	edx
+	pop	ecx
+	pop	ebx
+	ret
+
+
+;------------------------------------------------------------------------------
+; Name:		RandomWriter
+; Purpose:	Writes 32-bit value randomly to an area of memory.
+; Params:
+;	[esp+4] = ptr to array of chunk pointers
+;	[esp+8] = # of 256-byte chunks (offsets 0..252 of each chunk are written)
+;	[esp+12] = loops
+;	[esp+16] = long to write
+;------------------------------------------------------------------------------
+RandomWriter:
+_RandomWriter:
+	push	ebx
+	push	ecx
+	push	edx
+
+	mov	eax, [esp+16+12]	; get datum.
+	mov	ecx, [esp+12+12]	; loops to do.
+
+.L0:
+	mov	ebx, [esp+8+12]		; # chunks to do
+
+.L1:
+	sub	ebx, 1			; walk the pointer array from the last entry
+	jc	.L2			; borrow = index went below zero, pass finished
+
+	mov	edx, [esp+4+12]		; get ptr to memory chunk.
+	mov	edx, [edx + 4*ebx]
+
+	mov	[edx+212], eax
+	mov	[edx+156], eax
+	mov	[edx+132], eax
+	mov	[20+edx], eax
+	mov	[edx+172], eax
+	mov	[edx+196], eax
+	mov	[edx+248], eax
+	mov	[edx], eax
+	mov	[edx+136], eax
+	mov	[edx+228], eax
+	mov	[edx+160], eax
+	mov	[80+edx], eax
+	mov	[76+edx], eax
+	mov	[32+edx], eax
+	mov	[64+edx], eax
+	mov	[68+edx], eax
+	mov	[120+edx], eax
+	mov	[edx+216], eax
+	mov	[124+edx], eax
+	mov	[28+edx], eax
+	mov	[edx+152], eax
+	mov	[36+edx], eax
+	mov	[edx+220], eax
+	mov	[edx+188], eax
+	mov	[48+edx], eax
+	mov	[104+edx], eax
+	mov	[72+edx], eax
+	mov	[96+edx], eax
+	mov	[edx+184], eax
+	mov	[112+edx], eax
+	mov	[edx+236], eax
+	mov	[edx+224], eax
+	mov	[edx+252], eax
+	mov	[88+edx], eax
+	mov	[edx+180], eax
+	mov	[60+edx], eax
+	mov	[24+edx], eax
+	mov	[edx+192], eax
+	mov	[edx+164], eax
+	mov	[edx+204], eax
+	mov	[44+edx], eax
+	mov	[edx+168], eax
+	mov	[92+edx], eax
+	mov	[edx+208], eax
+	mov	[8+edx], eax
+	mov	[edx+144], eax
+	mov	[edx+148], eax
+	mov	[edx+128], eax
+	mov	[52+edx], eax
+	mov	[4+edx], eax
+	mov	[108+edx], eax
+	mov	[12+edx], eax
+	mov	[56+edx], eax
+	mov	[edx+200], eax
+	mov	[edx+232], eax
+	mov	[16+edx], eax
+	mov	[edx+244], eax
+	mov	[40+edx], eax
+	mov	[edx+140], eax
+	mov	[84+edx], eax
+	mov	[100+edx], eax
+	mov	[116+edx], eax
+	mov	[edx+176], eax
+	mov	[edx+240], eax
+
+	jmp	.L1
+
+.L2:
+	sub	ecx, 1
+	jnz	.L0
+
+	pop	edx
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		RandomWriterSSE2
+; Purpose:	Writes 128-bit value randomly to an area of memory.
+; Params:
+;	[esp+4] = ptr to array of chunk pointers (chunks 16-byte aligned)
+;	[esp+8] = # of 256-byte chunks (offsets 0..240 of each chunk are written)
+;	[esp+12] = loops
+;	[esp+16] = long to write, replicated into all four lanes of xmm0
+;------------------------------------------------------------------------------
+RandomWriterSSE2:
+_RandomWriterSSE2:
+	push	ebx
+	push	ecx
+	push	edx
+
+	mov	eax, [esp+16+12]	; three pushes, so +12 (+8 fetched the loop count)
+	movd	xmm0, eax		; Create a 128-bit replication of the 32-bit
+	movd	xmm1, eax		; value that was provided.
+	movd	xmm2, eax
+	movd	xmm3, eax
+	pslldq	xmm1, 4			; pslldq counts BYTES; a count above 15 zeroes
+	pslldq	xmm2, 8			; the register, so the lanes sit at byte
+	pslldq	xmm3, 12		; offsets 4, 8 and 12 (not 32, 64 and 96).
+	por	xmm0, xmm1
+	por	xmm0, xmm2
+	por	xmm0, xmm3
+
+	mov	ecx, [esp+12+12]	; loops to do.
+
+.L0:
+	mov	ebx, [esp+8+12]		; # chunks to do
+
+.L1:
+	sub	ebx, 1			; walk the pointer array from the last entry
+	jc	.L2			; borrow = index went below zero, pass finished
+
+	mov	edx, [esp+4+12]		; get ptr to memory chunk.
+	mov	edx, [edx + 4*ebx]
+
+	movdqa	[64+edx], xmm0
+	movdqa	[208+edx], xmm0
+	movdqa	[128+edx], xmm0
+	movdqa	[112+edx], xmm0
+	movdqa	[176+edx], xmm0
+	movdqa	[144+edx], xmm0
+	movdqa	[edx], xmm0
+	movdqa	[96+edx], xmm0
+	movdqa	[48+edx], xmm0
+	movdqa	[16+edx], xmm0
+	movdqa	[192+edx], xmm0
+	movdqa	[160+edx], xmm0
+	movdqa	[32+edx], xmm0
+	movdqa	[240+edx], xmm0
+	movdqa	[224+edx], xmm0
+	movdqa	[80+edx], xmm0
+
+	jmp	.L1
+
+.L2:
+	sub	ecx, 1
+	jnz	.L0
+
+	pop	edx
+	pop	ecx
+	pop	ebx
+	ret
+
+
+;------------------------------------------------------------------------------
+; Name:		RandomWriterSSE2_bypass
+; Purpose:	Writes 128-bit value randomly into memory, bypassing caches.
+; Params:
+;	[esp+4] = ptr to array of chunk pointers (chunks 16-byte aligned)
+;	[esp+8] = # of 256-byte chunks (offsets 0..240 of each chunk are written)
+;	[esp+12] = loops
+;	[esp+16] = long to write, replicated into all four lanes of xmm0
+;------------------------------------------------------------------------------
+RandomWriterSSE2_bypass:
+_RandomWriterSSE2_bypass:
+	push	ebx
+	push	ecx
+	push	edx
+
+	mov	eax, [esp+16+12]	; three pushes, so +12 (+8 fetched the loop count)
+	movd	xmm0, eax		; Create a 128-bit replication of the 32-bit
+	movd	xmm1, eax		; value that was provided.
+	movd	xmm2, eax
+	movd	xmm3, eax
+	pslldq	xmm1, 4			; pslldq counts BYTES; a count above 15 zeroes
+	pslldq	xmm2, 8			; the register, so the lanes sit at byte
+	pslldq	xmm3, 12		; offsets 4, 8 and 12 (not 32, 64 and 96).
+	por	xmm0, xmm1
+	por	xmm0, xmm2
+	por	xmm0, xmm3
+
+	mov	ecx, [esp+12+12]	; loops to do.
+
+.L0:
+	mov	ebx, [esp+8+12]		; # chunks to do
+
+.L1:
+	sub	ebx, 1			; walk the pointer array from the last entry
+	jc	.L2			; borrow = index went below zero, pass finished
+
+	mov	edx, [esp+4+12]		; get ptr to memory chunk.
+	mov	edx, [edx + 4*ebx]
+
+	movntdq	[128+edx], xmm0
+	movntdq	[240+edx], xmm0
+	movntdq	[112+edx], xmm0
+	movntdq	[64+edx], xmm0
+	movntdq	[176+edx], xmm0
+	movntdq	[144+edx], xmm0
+	movntdq	[edx], xmm0
+	movntdq	[208+edx], xmm0
+	movntdq	[80+edx], xmm0
+	movntdq	[96+edx], xmm0
+	movntdq	[48+edx], xmm0
+	movntdq	[16+edx], xmm0
+	movntdq	[192+edx], xmm0
+	movntdq	[160+edx], xmm0
+	movntdq	[224+edx], xmm0
+	movntdq	[32+edx], xmm0
+
+	jmp	.L1
+
+.L2:
+	sub	ecx, 1
+	jnz	.L0
+
+	pop	edx
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		RegisterToRegister
+; Purpose:	Reads/writes 32-bit values between registers of
+;		the main register set.
+; Params:
+;	dword [esp+4] = loops
+;------------------------------------------------------------------------------
+RegisterToRegister:
+_RegisterToRegister:
+	push	ebx			; ebx is overwritten by the second half below
+	push	ecx
+
+	mov	ecx, [esp+4+8]		; loops to do.
+
+.L1:
+	mov	eax, ebx		; 64 transfers by 4 bytes = 256 bytes
+	mov	eax, ecx
+	mov	eax, edx
+	mov	eax, esi
+	mov	eax, edi
+	mov	eax, ebp
+	mov	eax, esp
+	mov	eax, ebx
+	mov	eax, ebx
+	mov	eax, ecx
+	mov	eax, edx
+	mov	eax, esi
+	mov	eax, edi
+	mov	eax, ebp
+	mov	eax, esp
+	mov	eax, ebx
+	mov	eax, ebx
+	mov	eax, ecx
+	mov	eax, edx
+	mov	eax, esi
+	mov	eax, edi
+	mov	eax, ebp
+	mov	eax, esp
+	mov	eax, ebx
+	mov	eax, ebx
+	mov	eax, ecx
+	mov	eax, edx
+	mov	eax, esi
+	mov	eax, edi
+	mov	eax, ebp
+	mov	eax, esp
+	mov	eax, ebx
+
+	mov	ebx, eax
+	mov	ebx, ecx
+	mov	ebx, edx
+	mov	ebx, esi
+	mov	ebx, edi
+	mov	ebx, ebp
+	mov	ebx, esp
+	mov	ebx, eax
+	mov	ebx, eax
+	mov	ebx, ecx
+	mov	ebx, edx
+	mov	ebx, esi
+	mov	ebx, edi
+	mov	ebx, ebp
+	mov	ebx, esp
+	mov	ebx, eax
+	mov	ebx, eax
+	mov	ebx, ecx
+	mov	ebx, edx
+	mov	ebx, esi
+	mov	ebx, edi
+	mov	ebx, ebp
+	mov	ebx, esp
+	mov	ebx, eax
+	mov	ebx, eax
+	mov	ebx, ecx
+	mov	ebx, edx
+	mov	ebx, esi
+	mov	ebx, edi
+	mov	ebx, ebp
+	mov	ebx, esp
+	mov	ebx, eax
+
+	dec	ecx
+	jnz	.L1
+
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		VectorToVector
+; Purpose:	Reads/writes 128-bit values between registers of
+;		the vector register set, in this case XMM.
+;		(I don't have access to anything with YMM.)
+; Params:	dword [esp + 4] = count.
+;------------------------------------------------------------------------------
+VectorToVector:
+_VectorToVector:
+	mov	eax, [esp + 4]		; eax = loop counter
+.L1:
+	movdqa	xmm0, xmm1		; 16 transfers of 16 bytes = 256 bytes
+	movdqa	xmm0, xmm2
+	movdqa	xmm0, xmm3
+	movdqa	xmm2, xmm0
+	movdqa	xmm1, xmm2
+	movdqa	xmm2, xmm1
+	movdqa	xmm0, xmm3
+	movdqa	xmm3, xmm1
+
+	movdqa	xmm3, xmm2
+	movdqa	xmm1, xmm3
+	movdqa	xmm2, xmm1
+	movdqa	xmm0, xmm1
+	movdqa	xmm1, xmm2
+	movdqa	xmm0, xmm1
+	movdqa	xmm0, xmm3
+	movdqa	xmm3, xmm0
+
+	dec	eax
+	jnz	.L1
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		RegisterToVector
+; Purpose:	Writes 32-bit main register values into 128-bit vector register
+;		clearing the upper unused bits.
+; Params:	dword [esp + 4] = count.
+;------------------------------------------------------------------------------
+RegisterToVector:
+_RegisterToVector:
+	mov	eax, [esp + 4]
+	add	eax, eax		; Double # of loops.
+.L1:
+	movd	xmm1, eax		; 32 transfers of 4 bytes = 128 bytes
+	movd	xmm2, eax
+	movd	xmm3, eax
+	movd	xmm0, eax
+	movd	xmm1, eax
+	movd	xmm2, eax
+	movd	xmm3, eax
+	movd	xmm0, eax
+
+	movd	xmm1, eax
+	movd	xmm3, eax
+	movd	xmm2, eax
+	movd	xmm0, eax
+	movd	xmm1, eax
+	movd	xmm2, eax
+	movd	xmm3, eax
+	movd	xmm0, eax
+
+	movd	xmm0, eax
+	movd	xmm2, eax
+	movd	xmm0, eax
+	movd	xmm3, eax
+	movd	xmm1, eax
+	movd	xmm3, eax
+	movd	xmm2, eax
+	movd	xmm0, eax
+
+	movd	xmm0, eax
+	movd	xmm3, eax
+	movd	xmm1, eax
+	movd	xmm2, eax
+	movd	xmm0, eax
+	movd	xmm2, eax
+	movd	xmm3, eax
+	movd	xmm0, eax
+
+	dec	eax
+	jnz	.L1
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		VectorToRegister
+; Purpose:	Writes lowest 32 bits of vector registers into 32-bit main
+;		register.
+; Params:	dword [esp + 4] = count.
+;------------------------------------------------------------------------------
+VectorToRegister:
+_VectorToRegister:
+	mov	eax, [esp + 4]		; read the argument before the push moves esp
+	add	eax, eax		; Double # of loops.
+	push	ebx			; ebx is the scratch destination below
+.L1:
+	movd	ebx, xmm1		; 4 bytes per transfer therefore need 64
+	movd	ebx, xmm2		; to transfer 256 bytes.
+	movd	ebx, xmm3
+	movd	ebx, xmm0
+	movd	ebx, xmm1
+	movd	ebx, xmm2
+	movd	ebx, xmm3
+	movd	ebx, xmm0
+
+	movd	ebx, xmm1
+	movd	ebx, xmm3
+	movd	ebx, xmm2
+	movd	ebx, xmm0
+	movd	ebx, xmm1
+	movd	ebx, xmm2
+	movd	ebx, xmm3
+	movd	ebx, xmm0
+
+	movd	ebx, xmm0
+	movd	ebx, xmm2
+	movd	ebx, xmm0
+	movd	ebx, xmm3
+	movd	ebx, xmm1
+	movd	ebx, xmm3
+	movd	ebx, xmm2
+	movd	ebx, xmm0
+
+	movd	ebx, xmm0
+	movd	ebx, xmm3
+	movd	ebx, xmm1
+	movd	ebx, xmm2
+	movd	ebx, xmm0
+	movd	ebx, xmm2
+	movd	ebx, xmm3
+	movd	ebx, xmm0
+
+	dec	eax
+	jnz	.L1
+
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		StackReader
+; Purpose:	Reads 32-bit values off the stack into registers of
+;		the main register set, effectively testing L1 cache access
+;		*and* effective-address calculation speed.
+; Params:
+;	dword [esp+4] = loops
+;------------------------------------------------------------------------------
+StackReader:
+_StackReader:
+	push	ebx
+	push	ecx
+
+	mov	ecx, [esp+4+8]		; loops to do.
+
+	push	dword 7000		; [esp+24]
+	push	dword 6000		; [esp+20]
+	push	dword 5000		; [esp+16]
+	push	dword 4000		; [esp+12]
+	push	dword 3000		; [esp+8]
+	push	dword 2000		; [esp+4]
+	push	dword 1000		; [esp]
+
+.L1:
+	mov	eax, [esp]		; 64 reads of 4 bytes = 256 bytes per pass
+	mov	eax, [esp+8]
+	mov	eax, [esp+12]
+	mov	eax, [esp+16]
+	mov	eax, [esp+20]
+	mov	eax, [esp+4]
+	mov	eax, [esp+24]
+	mov	eax, [esp]
+	mov	eax, [esp]
+	mov	eax, [esp+8]
+	mov	eax, [esp+12]
+	mov	eax, [esp+16]
+	mov	eax, [esp+20]
+	mov	eax, [esp+4]
+	mov	eax, [esp+24]
+	mov	eax, [esp]
+	mov	eax, [esp]
+	mov	eax, [esp+8]
+	mov	eax, [esp+12]
+	mov	eax, [esp+16]
+	mov	eax, [esp+20]
+	mov	eax, [esp+4]
+	mov	eax, [esp+24]
+	mov	eax, [esp+4]
+	mov	eax, [esp+4]
+	mov	eax, [esp+8]
+	mov	eax, [esp+12]
+	mov	eax, [esp+16]
+	mov	eax, [esp+20]
+	mov	eax, [esp+4]
+	mov	eax, [esp+24]
+	mov	eax, [esp+4]
+
+	mov	ebx, [esp]
+	mov	ebx, [esp+8]
+	mov	ebx, [esp+12]
+	mov	ebx, [esp+16]
+	mov	ebx, [esp+20]
+	mov	ebx, [esp+4]
+	mov	ebx, [esp+24]
+	mov	ebx, [esp]
+	mov	ebx, [esp]
+	mov	ebx, [esp+8]
+	mov	ebx, [esp+12]
+	mov	ebx, [esp+16]
+	mov	ebx, [esp+20]
+	mov	ebx, [esp+4]
+	mov	ebx, [esp+24]
+	mov	ebx, [esp]
+	mov	ebx, [esp]
+	mov	ebx, [esp+8]
+	mov	ebx, [esp+12]
+	mov	ebx, [esp+16]
+	mov	ebx, [esp+20]
+	mov	ebx, [esp+4]
+	mov	ebx, [esp+24]
+	mov	ebx, [esp+4]
+	mov	ebx, [esp+4]
+	mov	ebx, [esp+8]
+	mov	ebx, [esp+12]
+	mov	ebx, [esp+16]
+	mov	ebx, [esp+20]
+	mov	ebx, [esp+4]
+	mov	ebx, [esp+24]
+	mov	ebx, [esp+4]
+
+	dec	ecx
+	jnz	.L1
+
+	add	esp, 28			; drop the seven dwords pushed above
+
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		StackWriter
+; Purpose:	Writes 32-bit values into the stack from registers of
+;		the main register set, effectively testing L1 cache access
+;		*and* effective-address calculation speed.
+; Params:
+;	dword [esp+4] = loops
+;------------------------------------------------------------------------------
+StackWriter:
+_StackWriter:
+	push	ebx
+	push	ecx
+
+	mov	ecx, [esp+4+8]		; loops to do.
+
+	push	dword 7000		; [esp+24]
+	push	dword 6000		; [esp+20]
+	push	dword 5000		; [esp+16]
+	push	dword 4000		; [esp+12]
+	push	dword 3000		; [esp+8]
+	push	dword 2000		; [esp+4]
+	push	dword 1000		; [esp]
+
+	xor	eax, eax		; first half stores all-zero bits
+	mov	ebx, 0xffffffff		; second half stores all-one bits
+
+.L1:
+	mov	[esp], eax		; 64 writes of 4 bytes = 256 bytes per pass
+	mov	[esp+8], eax
+	mov	[esp+12], eax
+	mov	[esp+16], eax
+	mov	[esp+20], eax
+	mov	[esp+4], eax
+	mov	[esp+24], eax
+	mov	[esp], eax
+	mov	[esp], eax
+	mov	[esp+8], eax
+	mov	[esp+12], eax
+	mov	[esp+16], eax
+	mov	[esp+20], eax
+	mov	[esp+4], eax
+	mov	[esp+24], eax
+	mov	[esp], eax
+	mov	[esp], eax
+	mov	[esp+8], eax
+	mov	[esp+12], eax
+	mov	[esp+16], eax
+	mov	[esp+20], eax
+	mov	[esp+4], eax
+	mov	[esp+24], eax
+	mov	[esp+4], eax
+	mov	[esp+4], eax
+	mov	[esp+8], eax
+	mov	[esp+12], eax
+	mov	[esp+16], eax
+	mov	[esp+20], eax
+	mov	[esp+4], eax
+	mov	[esp+24], eax
+	mov	[esp+4], eax
+
+	mov	[esp], ebx
+	mov	[esp+8], ebx
+	mov	[esp+12], ebx
+	mov	[esp+16], ebx
+	mov	[esp+20], ebx
+	mov	[esp+4], ebx
+	mov	[esp+24], ebx
+	mov	[esp], ebx
+	mov	[esp], ebx
+	mov	[esp+8], ebx
+	mov	[esp+12], ebx
+	mov	[esp+16], ebx
+	mov	[esp+20], ebx
+	mov	[esp+4], ebx
+	mov	[esp+24], ebx
+	mov	[esp], ebx
+	mov	[esp], ebx
+	mov	[esp+8], ebx
+	mov	[esp+12], ebx
+	mov	[esp+16], ebx
+	mov	[esp+20], ebx
+	mov	[esp+4], ebx
+	mov	[esp+24], ebx
+	mov	[esp+4], ebx
+	mov	[esp+4], ebx
+	mov	[esp+8], ebx
+	mov	[esp+12], ebx
+	mov	[esp+16], ebx
+	mov	[esp+20], ebx
+	mov	[esp+4], ebx
+	mov	[esp+24], ebx
+	mov	[esp+4], ebx
+
+	sub	ecx, 1
+	jnz	.L1
+
+	add	esp, 28			; drop the seven dwords pushed above
+
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Register8ToVector
+; Purpose:	Writes 8-bit main register values into 128-bit vector register
+;		without clearing the unused bits.
+; Params:	dword [esp + 4] = loops
+;------------------------------------------------------------------------------
+Register8ToVector:
+_Register8ToVector:
+	mov	eax, [esp + 4]
+	sal	eax, 4			; Force some repetition.
+.L1:
+	pinsrb	xmm1, al, 0		; pinsrb is SSE4.1; has_sse2 does not cover it
+	pinsrb	xmm2, bl, 1
+	pinsrb	xmm3, cl, 2
+	pinsrb	xmm1, dl, 3
+	pinsrb	xmm2, al, 4
+	pinsrb	xmm3, bl, 5
+	pinsrb	xmm0, cl, 6
+	pinsrb	xmm0, dl, 7
+
+	pinsrb	xmm0, al, 0
+	pinsrb	xmm1, bl, 1
+	pinsrb	xmm2, cl, 2
+	pinsrb	xmm3, dl, 3
+	pinsrb	xmm3, al, 4
+	pinsrb	xmm2, bl, 5
+	pinsrb	xmm1, cl, 6
+	pinsrb	xmm0, dl, 7
+
+	dec	eax
+	jnz	.L1
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Register16ToVector
+; Purpose:	Writes 16-bit main register values into 128-bit vector register
+;		without clearing the unused bits.
+; Params:	dword [esp + 4] = loops
+;------------------------------------------------------------------------------
+Register16ToVector:
+_Register16ToVector:
+	mov	eax, [esp + 4]
+	sal	eax, 3			; Force some repetition.
+.L1:
+	pinsrw	xmm1, ax, 0		; source registers are only read, never written
+	pinsrw	xmm2, bx, 1
+	pinsrw	xmm3, cx, 2
+	pinsrw	xmm1, dx, 3
+	pinsrw	xmm2, si, 4
+	pinsrw	xmm3, di, 5
+	pinsrw	xmm0, bp, 6
+	pinsrw	xmm0, sp, 7
+
+	pinsrw	xmm0, ax, 0
+	pinsrw	xmm1, bx, 1
+	pinsrw	xmm2, cx, 2
+	pinsrw	xmm3, dx, 3
+	pinsrw	xmm3, si, 4
+	pinsrw	xmm2, di, 5
+	pinsrw	xmm1, bp, 6
+	pinsrw	xmm0, sp, 7
+
+	dec	eax
+	jnz	.L1
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Register32ToVector
+; Purpose:	Writes 32-bit main register values into 128-bit vector register
+;		without clearing the unused bits.
+; Params:	dword [esp + 4] = loops
+;------------------------------------------------------------------------------
+Register32ToVector:
+_Register32ToVector:
+	mov	eax, [esp + 4]
+	sal	eax, 2			; Force some repetition.
+.L1:
+	pinsrd	xmm1, eax, 0		; Each xfer moves 4 bytes so to move 256 bytes
+	pinsrd	xmm2, ebx, 1		; we need 64 transfers.
+	pinsrd	xmm3, ecx, 2		; (pinsrd is an SSE4.1 instruction.)
+	pinsrd	xmm1, edx, 3
+	pinsrd	xmm2, esi, 0
+	pinsrd	xmm3, edi, 1
+	pinsrd	xmm0, ebp, 2
+	pinsrd	xmm0, esp, 3
+
+	pinsrd	xmm0, eax, 0
+	pinsrd	xmm1, ebx, 1
+	pinsrd	xmm2, ecx, 2
+	pinsrd	xmm3, edx, 3
+	pinsrd	xmm3, esi, 3
+	pinsrd	xmm2, edi, 2
+	pinsrd	xmm1, ebp, 1
+	pinsrd	xmm0, esp, 0
+
+	dec	eax
+	jnz	.L1
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Register64ToVector
+; Purpose:	Writes 64-bit main register values into 128-bit vector register
+;		without clearing the unused bits.
+; Params:	dword [esp + 4] = loops (unused; this 32-bit build is a stub)
+;------------------------------------------------------------------------------
+Register64ToVector:
+_Register64ToVector:
+	; There are no 64-bit registers on x86.
+	ret
+
+
+;------------------------------------------------------------------------------
+; Name:		Vector8ToRegister
+; Purpose:	Writes 8-bit vector register values into main register.
+; Params:	dword [esp + 4] = loops
+;------------------------------------------------------------------------------
+Vector8ToRegister:
+_Vector8ToRegister:
+	mov	eax, [esp + 4]		; read the argument before the push moves esp
+	sal	eax, 4			; Force some repetition.
+	push	ebx			; ebx is the scratch destination below
+.L1:
+	pextrb	ebx, xmm1, 0		; pextrb is an SSE4.1 instruction
+	pextrb	ebx, xmm2, 1
+	pextrb	ebx, xmm3, 2
+	pextrb	ebx, xmm1, 3
+	pextrb	ebx, xmm2, 4
+	pextrb	ebx, xmm3, 5
+	pextrb	ebx, xmm0, 6
+	pextrb	ebx, xmm0, 7
+
+	pextrb	ebx, xmm0, 0
+	pextrb	ebx, xmm1, 1
+	pextrb	ebx, xmm2, 2
+	pextrb	ebx, xmm3, 3
+	pextrb	ebx, xmm3, 4
+	pextrb	ebx, xmm2, 5
+	pextrb	ebx, xmm1, 6
+	pextrb	ebx, xmm0, 7
+
+	dec	eax
+	jnz	.L1
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Vector16ToRegister
+; Purpose:	Writes 16-bit vector register values into main register.
+; Params:	dword [esp + 4] = loops
+;------------------------------------------------------------------------------
+Vector16ToRegister:
+_Vector16ToRegister:
+	mov	eax, [esp + 4]		; read the argument before the push moves esp
+	sal	eax, 3			; Force some repetition.
+	push	ebx			; ebx is the scratch destination below
+.L1:
+	pextrw	ebx, xmm1, 0		; 256 byte chunk / 2 bytes/xfer = 128 xfers.
+	pextrw	ebx, xmm2, 1
+	pextrw	ebx, xmm3, 2
+	pextrw	ebx, xmm1, 3
+	pextrw	ebx, xmm2, 4
+	pextrw	ebx, xmm3, 5
+	pextrw	ebx, xmm0, 6
+	pextrw	ebx, xmm0, 7
+
+	pextrw	ebx, xmm0, 0
+	pextrw	ebx, xmm1, 1
+	pextrw	ebx, xmm2, 2
+	pextrw	ebx, xmm3, 3
+	pextrw	ebx, xmm3, 4
+	pextrw	ebx, xmm2, 5
+	pextrw	ebx, xmm1, 6
+	pextrw	ebx, xmm0, 7
+
+	dec	eax
+	jnz	.L1
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Vector32ToRegister
+; Purpose:	Writes 32-bit vector register values into main register.
+; Params:	dword [esp + 4] = loops
+;------------------------------------------------------------------------------
+Vector32ToRegister:
+_Vector32ToRegister:
+	mov	eax, [esp + 4]		; read the argument before the push moves esp
+	sal	eax, 2			; Force some repetition.
+	push	ebx			; ebx is the scratch destination below
+.L1:
+	pextrd	ebx, xmm1, 0		; 256 byte chunk / 4 bytes/xfer = 64 xfers.
+	pextrd	ebx, xmm2, 1		; (pextrd is an SSE4.1 instruction.)
+	pextrd	ebx, xmm3, 2
+	pextrd	ebx, xmm1, 3
+	pextrd	ebx, xmm2, 0
+	pextrd	ebx, xmm3, 1
+	pextrd	ebx, xmm0, 2
+	pextrd	ebx, xmm0, 3
+
+	pextrd	ebx, xmm0, 0
+	pextrd	ebx, xmm1, 1
+	pextrd	ebx, xmm2, 2
+	pextrd	ebx, xmm3, 3
+	pextrd	ebx, xmm3, 3
+	pextrd	ebx, xmm2, 2
+	pextrd	ebx, xmm1, 1
+	pextrd	ebx, xmm0, 0
+
+	dec	eax
+	jnz	.L1
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Vector64ToRegister
+; Purpose:	Writes 64-bit vector register values into main register.
+; Params:	dword [esp + 4] = loops (unused; this 32-bit build is a stub)
+;------------------------------------------------------------------------------
+Vector64ToRegister:
+_Vector64ToRegister:
+	; There are no 64-bit registers on x86.
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		CopySSE
+; Purpose:	Copies memory chunks that are 16-byte aligned.
+; Params:	[esp + 4] = ptr to destination memory area
+;		[esp + 8] = ptr to source memory area
+;		[esp + 12] = length in bytes (rounded down to a multiple of 128)
+;		[esp + 16] = loops
+;------------------------------------------------------------------------------
+CopySSE:
+_CopySSE:
+	; Register usage:
+	; esi = source
+	; edi = dest
+	; ecx = loops
+	; edx = length
+	push	esi
+	push	edi
+	push	ecx
+	push	edx
+
+	mov	edi, [esp + 4 + 16]	; +16 skips the four saved registers
+	mov	esi, [esp + 8 + 16]
+	mov	edx, [esp + 12 + 16]
+	mov	ecx, [esp + 16 + 16]
+
+	shr	edx, 7			; Ensure length is multiple of 128.
+	shl	edx, 7			; ZF is set when the rounded length is zero.
+	jz	.L3			; Under 128 bytes: skip, or the count below wraps.
+	; Save our non-parameter XMM registers.
+	sub	esp, 64
+	movdqu	[esp], xmm4
+	movdqu	[16+esp], xmm5
+	movdqu	[32+esp], xmm6
+	movdqu	[48+esp], xmm7
+
+.L1:
+	mov	eax, edx		; eax = bytes left in this pass
+
+.L2:
+	; prefetchnta [esi]
+	movdqa	xmm0, [esi]		; load 128 bytes from the source ...
+	movdqa	xmm1, [16+esi]
+	movdqa	xmm2, [32+esi]
+	movdqa	xmm3, [48+esi]
+	movdqa	xmm4, [64+esi]
+	movdqa	xmm5, [80+esi]
+	movdqa	xmm6, [96+esi]
+	movdqa	xmm7, [112+esi]
+
+	movntdq	[edi], xmm0		; ... and store them with non-temporal writes
+	movntdq	[16+edi], xmm1
+	movntdq	[32+edi], xmm2
+	movntdq	[48+edi], xmm3
+	movntdq	[64+edi], xmm4
+	movntdq	[80+edi], xmm5
+	movntdq	[96+edi], xmm6
+	movntdq	[112+edi], xmm7
+
+	add	esi, 128
+	add	edi, 128
+
+	sub	eax, 128
+	jnz	.L2
+
+	sub	esi, edx		; esi now points to start.
+	sub	edi, edx		; edi now points to start.
+
+	dec	ecx
+	jnz	.L1
+
+	movdqu	xmm4, [0+esp]
+	movdqu	xmm5, [16+esp]
+	movdqu	xmm6, [32+esp]
+	movdqu	xmm7, [48+esp]
+	add	esp, 64
+.L3:
+	pop	edx
+	pop	ecx
+	pop	edi
+	pop	esi
+	ret
diff --git a/routines64.asm b/routines64.asm new file mode 100644 index 0000000..18e8f6e --- /dev/null +++ b/routines64.asm
@@ -0,0 +1,1516 @@ + +; ============================================================================ +; bandwidth 0.23, a benchmark to estimate memory transfer bandwidth. +; Copyright (C) 2005-2010 by Zack T Smith. +; +; This program is free software; you can redistribute it and/or modify +; it under the terms of the GNU General Public License as published by +; the Free Software Foundation; either version 2 of the License, or +; (at your option) any later version. +; +; This program is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; GNU General Public License for more details. +; +; You should have received a copy of the GNU General Public License +; along with this program; if not, write to the Free Software +; Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +; +; The author may be reached at fbui@comcast.net. +; ============================================================================= + +bits 64 +cpu x64 + +global Reader +global RandomReader +global ReaderSSE2 +global RandomReaderSSE2 + +global Writer +global RandomWriter +global WriterSSE2 +global RandomWriterSSE2 + +global WriterSSE2_bypass +global RandomWriterSSE2_bypass + +global CopySSE +global _CopySSE + +global has_sse2 + +global RegisterToRegister +global RegisterToVector +global VectorToRegister +global VectorToVector + +global Register8ToVector +global Register16ToVector +global Register32ToVector +global Register64ToVector +global Vector8ToRegister +global Vector16ToRegister +global Vector32ToRegister +global Vector64ToRegister + +global StackReader +global StackWriter + +global _Reader +global _RandomReader +global _ReaderSSE2 +global _RandomReaderSSE2 + +global _Writer +global _RandomWriter +global _WriterSSE2 +global _RandomWriterSSE2 + +global _WriterSSE2_bypass +global _RandomWriterSSE2_bypass + +global _has_sse2 + +global 
_RegisterToRegister
+global _RegisterToVector
+global _VectorToRegister
+global _VectorToVector
+
+global _Register8ToVector
+global _Register16ToVector
+global _Register32ToVector
+global _Register64ToVector
+global _Vector8ToRegister
+global _Vector16ToRegister
+global _Vector32ToRegister
+global _Vector64ToRegister
+
+global _StackReader
+global _StackWriter
+
+; Note:
+; Unix ABI says integer param are put in these registers in this order:
+; rdi, rsi, rdx, rcx, r8, r9
+
+    section .text
+
+;------------------------------------------------------------------------------
+; Name: has_sse2
+; Purpose: Tests the SSE2 feature flag, CPUID leaf 1 EDX bit 26 (0x4000000).
+; Params: none
+; Returns: rax = 1 if the bit is set, 0 otherwise.
+;
+has_sse2:
+_has_sse2:
+    push rbx            ; cpuid overwrites rax, rbx, rcx and rdx.
+    push rcx
+    push rdx
+    mov rax, 1
+    cpuid
+    xor eax, eax        ; Drop the CPUID signature so that only al carries
+                        ; the result. Must come before the test because
+                        ; xor rewrites the flags.
+    test rdx, 0x4000000
+    setnz al
+    pop rdx
+    pop rcx
+    pop rbx
+    ret
+
+;------------------------------------------------------------------------------
+; Name: Reader
+; Purpose: Reads 64-bit values sequentially from an area of memory.
+; Params: rdi = ptr to memory area
+;         rsi = length in bytes
+;         rdx = loops
+;------------------------------------------------------------------------------
+Reader:
+_Reader:
+    push r10
+
+    add rsi, rdi        ; rsi now points to end.
+ +.L1: + mov r10, rdi + +.L2: + mov rax, [r10] + mov rax, [8+r10] + mov rax, [16+r10] + mov rax, [24+r10] + mov rax, [32+r10] + mov rax, [40+r10] + mov rax, [48+r10] + mov rax, [56+r10] + mov rax, [64+r10] + mov rax, [72+r10] + mov rax, [80+r10] + mov rax, [88+r10] + mov rax, [96+r10] + mov rax, [104+r10] + mov rax, [112+r10] + mov rax, [120+r10] + mov rax, [128+r10] + mov rax, [136+r10] + mov rax, [144+r10] + mov rax, [152+r10] + mov rax, [160+r10] + mov rax, [168+r10] + mov rax, [176+r10] + mov rax, [184+r10] + mov rax, [192+r10] + mov rax, [200+r10] + mov rax, [208+r10] + mov rax, [216+r10] + mov rax, [224+r10] + mov rax, [232+r10] + mov rax, [240+r10] + mov rax, [248+r10] + + add r10, 256 + cmp r10, rsi + jb .L2 + + dec rdx + jnz .L1 + + pop r10 + ret + +;------------------------------------------------------------------------------ +; Name: RandomReader +; Purpose: Reads 64-bit values randomly from an area of memory. +; Params: rdi = ptr to array of chunk pointers +; rsi = # of chunks +; rdx = loops +;------------------------------------------------------------------------------ +RandomReader: +_RandomReader: + push r10 + push r11 + +.L1: + xor r11, r11 + +.L2: + mov r10, [rdi + 8*r11] ; Note, 64-bit pointers. 
+ + mov rax, [96+r10] + mov rax, [r10] + mov rax, [120+r10] + mov rax, [184+r10] + mov rax, [160+r10] + mov rax, [176+r10] + mov rax, [112+r10] + mov rax, [80+r10] + mov rax, [32+r10] + mov rax, [128+r10] + mov rax, [88+r10] + mov rax, [40+r10] + mov rax, [48+r10] + mov rax, [72+r10] + mov rax, [200+r10] + mov rax, [24+r10] + mov rax, [152+r10] + mov rax, [16+r10] + mov rax, [248+r10] + mov rax, [56+r10] + mov rax, [240+r10] + mov rax, [208+r10] + mov rax, [104+r10] + mov rax, [216+r10] + mov rax, [136+r10] + mov rax, [232+r10] + mov rax, [64+r10] + mov rax, [224+r10] + mov rax, [144+r10] + mov rax, [192+r10] + mov rax, [8+r10] + mov rax, [168+r10] + + inc r11 + cmp r11, rsi + jb .L2 + + dec rdx + jnz .L1 + + pop r11 + pop r10 + ret + +;------------------------------------------------------------------------------ +; Name: RandomReaderSSE2 +; Purpose: Reads 128-bit values randomly from an area of memory. +; Params: rdi = ptr to array of chunk pointers +; rsi = # of chunks +; rdx = loops +;------------------------------------------------------------------------------ +RandomReaderSSE2: +_RandomReaderSSE2: + push r10 + push r11 + +.L1: + xor r11, r11 + +.L2: + mov r10, [rdi + 8*r11] + + movdqa xmm0, [240+r10] + movdqa xmm0, [128+r10] + movdqa xmm0, [64+r10] + movdqa xmm0, [208+r10] + movdqa xmm0, [112+r10] + movdqa xmm0, [176+r10] + movdqa xmm0, [144+r10] + movdqa xmm0, [r10] + movdqa xmm0, [96+r10] + movdqa xmm0, [16+r10] + movdqa xmm0, [192+r10] + movdqa xmm0, [160+r10] + movdqa xmm0, [32+r10] + movdqa xmm0, [48+r10] + movdqa xmm0, [224+r10] + movdqa xmm0, [80+r10] + + inc r11 + cmp r11, rsi + jb .L2 + + dec rdx + jnz .L1 + + pop r11 + pop r10 + ret + +;------------------------------------------------------------------------------ +; Name: RandomWriter +; Purpose: Writes 64-bit values randomly to an area of memory. 
+; Params: rdi = ptr to array of chunk pointers
+;         rsi = # of chunks
+;         rdx = loops
+;         rcx = datum to write
+;------------------------------------------------------------------------------
+RandomWriter:
+_RandomWriter:
+    push r10
+    push r11
+
+.L1:
+    xor r11, r11
+
+.L2:
+    mov r10, [rdi + 8*r11]      ; Note, 64-bit pointers.
+
+    mov [96+r10], rcx
+    mov [r10], rcx
+    mov [120+r10], rcx
+    mov [184+r10], rcx
+    mov [160+r10], rcx
+    mov [176+r10], rcx
+    mov [112+r10], rcx
+    mov [80+r10], rcx
+    mov [32+r10], rcx
+    mov [128+r10], rcx
+    mov [88+r10], rcx
+    mov [40+r10], rcx
+    mov [48+r10], rcx
+    mov [72+r10], rcx
+    mov [200+r10], rcx
+    mov [24+r10], rcx
+    mov [152+r10], rcx
+    mov [16+r10], rcx
+    mov [248+r10], rcx
+    mov [56+r10], rcx
+    mov [240+r10], rcx
+    mov [208+r10], rcx
+    mov [104+r10], rcx
+    mov [216+r10], rcx
+    mov [136+r10], rcx
+    mov [232+r10], rcx
+    mov [64+r10], rcx
+    mov [224+r10], rcx
+    mov [144+r10], rcx
+    mov [192+r10], rcx
+    mov [8+r10], rcx
+    mov [168+r10], rcx
+
+    inc r11
+    cmp r11, rsi
+    jb .L2
+
+    dec rdx
+    jnz .L1
+
+    pop r11
+    pop r10
+    ret
+
+;------------------------------------------------------------------------------
+; Name: RandomWriterSSE2
+; Purpose: Writes 128-bit values randomly to an area of memory.
+; Params: rdi = ptr to array of chunk pointers
+;         rsi = # of chunks
+;         rdx = loops
+;         rcx = datum to write
+; Notes: Each chunk pointer is used with movdqa at offsets 0..240, so every
+;        chunk must be 16-byte aligned.
+;------------------------------------------------------------------------------
+RandomWriterSSE2:
+_RandomWriterSSE2:
+    push r10
+    push r11
+
+    movq xmm0, rcx      ; Create duplicated 128-bit datum
+    movq xmm1, rcx
+    pslldq xmm1, 8      ; pslldq counts bytes: move the copy into the upper
+                        ; quadword. (A count above 15 would clear xmm1.)
+    por xmm0, xmm1
+
+.L1:
+    xor r11, r11
+
+.L2:
+    mov r10, [rdi + 8*r11]      ; Note, 64-bit pointers.
+
+    movdqa [240+r10], xmm0
+    movdqa [128+r10], xmm0
+    movdqa [208+r10], xmm0
+    movdqa [112+r10], xmm0
+    movdqa [64+r10], xmm0
+    movdqa [176+r10], xmm0
+    movdqa [144+r10], xmm0
+    movdqa [r10], xmm0
+    movdqa [96+r10], xmm0
+    movdqa [16+r10], xmm0
+    movdqa [192+r10], xmm0
+    movdqa [160+r10], xmm0
+    movdqa [32+r10], xmm0
+    movdqa [48+r10], xmm0
+    movdqa [224+r10], xmm0
+    movdqa [80+r10], xmm0
+
+    inc r11
+    cmp r11, rsi
+    jb .L2
+
+    dec rdx
+    jnz .L1
+
+    pop r11
+    pop r10
+    ret
+
+;------------------------------------------------------------------------------
+; Name: RandomWriterSSE2_bypass
+; Purpose: Writes 128-bit values randomly into memory, bypassing caches.
+; Params: rdi = ptr to array of chunk pointers
+;         rsi = # of chunks
+;         rdx = loops
+;         rcx = datum to write
+; Notes: Same access pattern as RandomWriterSSE2, but the stores are movntdq.
+;------------------------------------------------------------------------------
+RandomWriterSSE2_bypass:
+_RandomWriterSSE2_bypass:
+    push r10
+    push r11
+
+    movq xmm0, rcx      ; Create duplicated 128-bit datum
+    movq xmm1, rcx
+    pslldq xmm1, 8      ; pslldq counts bytes: move the copy into the upper
+                        ; quadword. (A count above 15 would clear xmm1.)
+    por xmm0, xmm1
+
+.L1:
+    xor r11, r11
+
+.L2:
+    mov r10, [rdi + 8*r11]      ; Note, 64-bit pointers.
+
+    movntdq [240+r10], xmm0
+    movntdq [128+r10], xmm0
+    movntdq [208+r10], xmm0
+    movntdq [112+r10], xmm0
+    movntdq [64+r10], xmm0
+    movntdq [176+r10], xmm0
+    movntdq [144+r10], xmm0
+    movntdq [r10], xmm0
+    movntdq [96+r10], xmm0
+    movntdq [16+r10], xmm0
+    movntdq [192+r10], xmm0
+    movntdq [160+r10], xmm0
+    movntdq [32+r10], xmm0
+    movntdq [48+r10], xmm0
+    movntdq [224+r10], xmm0
+    movntdq [80+r10], xmm0
+
+    inc r11
+    cmp r11, rsi
+    jb .L2
+
+    dec rdx
+    jnz .L1
+
+    pop r11
+    pop r10
+    ret
+
+;------------------------------------------------------------------------------
+; Name: ReaderSSE2
+; Purpose: Reads 128-bit values sequentially from an area of memory.
+; Params: rdi = ptr to memory area +; rsi = length in bytes +; rdx = loops +;------------------------------------------------------------------------------ +ReaderSSE2: +_ReaderSSE2: + push r10 + + add rsi, rdi ; rsi now points to end. + +.L1: + mov r10, rdi + +.L2: + movdqa xmm0, [r10] ; Read aligned to 16-byte boundary. + movdqa xmm0, [16+r10] + movdqa xmm0, [32+r10] + movdqa xmm0, [48+r10] + movdqa xmm0, [64+r10] + movdqa xmm0, [80+r10] + movdqa xmm0, [96+r10] + movdqa xmm0, [112+r10] + + movdqa xmm0, [128+r10] + movdqa xmm0, [144+r10] + movdqa xmm0, [160+r10] + movdqa xmm0, [176+r10] + movdqa xmm0, [192+r10] + movdqa xmm0, [208+r10] + movdqa xmm0, [224+r10] + movdqa xmm0, [240+r10] + + add r10, 256 + cmp r10, rsi + jb .L2 + + dec rdx + jnz .L1 + + pop r10 + ret + + +;------------------------------------------------------------------------------ +; Name: Writer +; Purpose: Writes 64-bit value sequentially to an area of memory. +; Params: rdi = ptr to memory area +; rsi = length in bytes +; rdx = loops +; rcx = quad to write +;------------------------------------------------------------------------------ +Writer: +_Writer: + push r10 + + add rsi, rdi ; rsi now points to end. 
+ +.L1: + mov r10, rdi + +.L2: + mov [r10], rcx + mov [8+r10], rcx + mov [16+r10], rcx + mov [24+r10], rcx + mov [32+r10], rcx + mov [40+r10], rcx + mov [48+r10], rcx + mov [56+r10], rcx + mov [64+r10], rcx + mov [72+r10], rcx + mov [80+r10], rcx + mov [88+r10], rcx + mov [96+r10], rcx + mov [104+r10], rcx + mov [112+r10], rcx + mov [120+r10], rcx + mov [128+r10], rcx + mov [136+r10], rcx + mov [144+r10], rcx + mov [152+r10], rcx + mov [160+r10], rcx + mov [168+r10], rcx + mov [176+r10], rcx + mov [184+r10], rcx + mov [192+r10], rcx + mov [200+r10], rcx + mov [208+r10], rcx + mov [216+r10], rcx + mov [224+r10], rcx + mov [232+r10], rcx + mov [240+r10], rcx + mov [248+r10], rcx + + add r10, 256 + cmp r10, rsi + jb .L2 + + dec rdx + jnz .L1 + + pop r10 + ret + +;------------------------------------------------------------------------------ +; Name: WriterSSE2 +; Purpose: Writes 128-bit value sequentially to an area of memory. +; Params: rdi = ptr to memory area +; rsi = length in bytes +; rdx = loops +; rcx = quad to write +;------------------------------------------------------------------------------ +WriterSSE2: +_WriterSSE2: + push r10 + + add rsi, rdi ; rsi now points to end. + + movq xmm0, rcx + +.L1: + mov r10, rdi + +.L2: + movdqa [r10], xmm0 + movdqa [16+r10], xmm0 + movdqa [32+r10], xmm0 + movdqa [48+r10], xmm0 + movdqa [64+r10], xmm0 + movdqa [80+r10], xmm0 + movdqa [96+r10], xmm0 + movdqa [112+r10], xmm0 + + movdqa [128+r10], xmm0 + movdqa [144+r10], xmm0 + movdqa [160+r10], xmm0 + movdqa [176+r10], xmm0 + movdqa [192+r10], xmm0 + movdqa [208+r10], xmm0 + movdqa [224+r10], xmm0 + movdqa [240+r10], xmm0 + + add r10, 256 + cmp r10, rsi + jb .L2 + + dec rdx + jnz .L1 + + pop r10 + ret + +;------------------------------------------------------------------------------ +; Name: WriterSSE2_bypass +; Purpose: Writes 128-bit value sequentially to an area of memory. 
+; Params: rdi = ptr to memory area +; rsi = length in bytes +; rdx = loops +; rcx = quad to write +;------------------------------------------------------------------------------ +WriterSSE2_bypass: +_WriterSSE2_bypass: + push r10 + + add rsi, rdi ; rsi now points to end. + + movq xmm0, rcx + +.L1: + mov r10, rdi + +.L2: + movntdq [r10], xmm0 ; Write bypassing cache. + movntdq [16+r10], xmm0 + movntdq [32+r10], xmm0 + movntdq [48+r10], xmm0 + movntdq [64+r10], xmm0 + movntdq [80+r10], xmm0 + movntdq [96+r10], xmm0 + movntdq [112+r10], xmm0 + + movntdq [128+r10], xmm0 + movntdq [144+r10], xmm0 + movntdq [160+r10], xmm0 + movntdq [176+r10], xmm0 + movntdq [192+r10], xmm0 + movntdq [208+r10], xmm0 + movntdq [224+r10], xmm0 + movntdq [240+r10], xmm0 + + add r10, 256 + cmp r10, rsi + jb .L2 + + dec rdx + jnz .L1 + + pop r10 + ret + +;------------------------------------------------------------------------------ +; Name: StackReader +; Purpose: Reads 64-bit values off the stack into registers of +; the main register set, effectively testing L1 cache access +; *and* effective-address calculation speed. 
+; Params: rdi = loops
+; Notes: Seven qwords are pushed as scratch data, so the slots owned by this
+;        routine are [rsp] .. [rsp+48]; [rsp+56] holds the return address.
+;        Every load below stays inside those seven slots.
+;------------------------------------------------------------------------------
+StackReader:
+_StackReader:
+    push qword 7000     ; [rsp+48]
+    push qword 6000     ; [rsp+40]
+    push qword 5000     ; [rsp+32]
+    push qword 4000     ; [rsp+24]
+    push qword 3000     ; [rsp+16]
+    push qword 2000     ; [rsp+8]
+    push qword 1000     ; [rsp]
+
+.L1:
+    mov rax, [rsp]
+    mov rax, [rsp+16]
+    mov rax, [rsp+24]
+    mov rax, [rsp+32]
+    mov rax, [rsp+40]
+    mov rax, [rsp+8]
+    mov rax, [rsp+48]
+    mov rax, [rsp]
+    mov rax, [rsp]
+    mov rax, [rsp+16]
+    mov rax, [rsp+24]
+    mov rax, [rsp+32]
+    mov rax, [rsp+40]
+    mov rax, [rsp+8]
+    mov rax, [rsp+48]
+    mov rax, [rsp]
+    mov rax, [rsp]
+    mov rax, [rsp+16]
+    mov rax, [rsp+24]
+    mov rax, [rsp+32]
+    mov rax, [rsp+40]
+    mov rax, [rsp+8]
+    mov rax, [rsp+48]
+    mov rax, [rsp+8]
+    mov rax, [rsp+8]
+    mov rax, [rsp+16]
+    mov rax, [rsp+24]
+    mov rax, [rsp+32]
+    mov rax, [rsp+40]
+    mov rax, [rsp+8]
+    mov rax, [rsp+48]
+    mov rax, [rsp+8]
+
+    sub rdi, 1
+    jnz .L1
+
+    add rsp, 56         ; Drop the seven pushed qwords.
+    ret
+
+;------------------------------------------------------------------------------
+; Name: StackWriter
+; Purpose: Writes 64-bit values into the stack from registers of
+;          the main register set, effectively testing L1 cache access
+;          *and* effective-address calculation speed.
+; Params: rdi = loops
+; Notes: Seven qwords are pushed as scratch space, so the slots owned by this
+;        routine are [rsp] .. [rsp+48]. [rsp+56] holds the return address and
+;        everything above it belongs to the caller, so no store below may
+;        use an offset greater than 48.
+;------------------------------------------------------------------------------
+StackWriter:
+_StackWriter:
+    push qword 7000     ; [rsp+48]
+    push qword 6000     ; [rsp+40]
+    push qword 5000     ; [rsp+32]
+    push qword 4000     ; [rsp+24]
+    push qword 3000     ; [rsp+16]
+    push qword 2000     ; [rsp+8]
+    push qword 1000     ; [rsp]
+
+    xor rax, rax
+
+.L1:
+    mov [rsp], rax
+    mov [rsp+16], rax
+    mov [rsp+24], rax
+    mov [rsp+32], rax
+    mov [rsp+40], rax
+    mov [rsp+8], rax
+    mov [rsp+48], rax
+    mov [rsp], rax
+    mov [rsp], rax
+    mov [rsp+16], rax
+    mov [rsp+24], rax
+    mov [rsp+32], rax
+    mov [rsp+40], rax
+    mov [rsp+8], rax
+    mov [rsp+48], rax
+    mov [rsp], rax
+    mov [rsp], rax
+    mov [rsp+16], rax
+    mov [rsp+24], rax
+    mov [rsp+32], rax
+    mov [rsp+40], rax
+    mov [rsp+8], rax
+    mov [rsp+48], rax
+    mov [rsp+8], rax
+    mov [rsp+8], rax
+    mov [rsp+16], rax
+    mov [rsp+24], rax
+    mov [rsp+32], rax
+    mov [rsp+40], rax
+    mov [rsp+8], rax
+    mov [rsp+48], rax
+    mov [rsp+8], rax
+
+    sub rdi, 1
+    jnz .L1
+
+    add rsp, 56         ; Drop the seven pushed qwords.
+    ret
+
+;------------------------------------------------------------------------------
+; Name: RegisterToRegister
+; Purpose: Reads/writes 64-bit values between registers of
+;          the main register set.
+; Params: rdi = loops +;------------------------------------------------------------------------------ +RegisterToRegister: +_RegisterToRegister: +.L1: + mov rax, rbx + mov rax, rcx + mov rax, rdx + mov rax, rsi + mov rax, rdi + mov rax, rbp + mov rax, rsp + mov rax, rbx + mov rax, rbx + mov rax, rcx + mov rax, rdx + mov rax, rsi + mov rax, rdi + mov rax, rbp + mov rax, rsp + mov rax, rbx + mov rax, rbx + mov rax, rcx + mov rax, rdx + mov rax, rsi + mov rax, rdi + mov rax, rbp + mov rax, rsp + mov rax, rbx + mov rax, rbx + mov rax, rcx + mov rax, rdx + mov rax, rsi + mov rax, rdi + mov rax, rbp + mov rax, rsp + mov rax, rbx + + sub rdi, 1 + jnz .L1 + ret + +;------------------------------------------------------------------------------ +; Name: VectorToVector +; Purpose: Reads/writes 128-bit values between registers of +; the vector register set, in this case XMM. +; (I don't have access to anything with YMM.) +; Params: rdi = loops +;------------------------------------------------------------------------------ +VectorToVector: +_VectorToVector: +.L1: + movdqa xmm0, xmm1 ; Each movdqa moves 16 bytes, so we need 16 + movdqa xmm0, xmm2 ; moves to transfer a 256 byte chunk. + movdqa xmm0, xmm3 + movdqa xmm2, xmm0 + movdqa xmm1, xmm2 + movdqa xmm2, xmm1 + movdqa xmm0, xmm3 + movdqa xmm3, xmm1 + + movdqa xmm3, xmm2 + movdqa xmm1, xmm3 + movdqa xmm2, xmm1 + movdqa xmm0, xmm1 + movdqa xmm1, xmm2 + movdqa xmm0, xmm1 + movdqa xmm0, xmm3 + movdqa xmm3, xmm0 + + sub rdi, 1 + jnz .L1 + ret + +;------------------------------------------------------------------------------ +; Name: RegisterToVector +; Purpose: Writes 64-bit main register values into 128-bit vector register +; clearing the upper unused bits. +; Params: rdi = loops +;------------------------------------------------------------------------------ +RegisterToVector: +_RegisterToVector: +.L1: + movq xmm1, rax ; Each movq transfers 8 bytes, so we need + movq xmm2, rsi ; 32 transfers to move a 256-byte chunk. 
+ movq xmm3, rbx + movq xmm1, rcx + movq xmm2, rsi + movq xmm3, rsp + movq xmm0, rdi + movq xmm0, rdx + + movq xmm0, rax + movq xmm1, rsi + movq xmm2, rbx + movq xmm3, rcx + movq xmm0, rsi + movq xmm3, rsp + movq xmm2, rdi + movq xmm1, rdx + + movq xmm0, rax + movq xmm1, rsi + movq xmm2, rbx + movq xmm3, rcx + movq xmm0, rsi + movq xmm3, rsp + movq xmm2, rdi + movq xmm1, rdx + + movq xmm0, rax + movq xmm1, rsi + movq xmm2, rbx + movq xmm3, rcx + movq xmm0, rsi + movq xmm3, rsp + movq xmm2, rdi + movq xmm1, rdx + + dec rdi + jnz .L1 + ret + +;------------------------------------------------------------------------------ +; Name: VectorToRegister +; Purpose: Writes lower 64 bits of vector register into 64-bit main +; register. +; Params: rdi = loops +;------------------------------------------------------------------------------ +VectorToRegister: +_VectorToRegister: +.L1: + movq rax, xmm1 + movq rax, xmm2 + movq rax, xmm3 + movq rax, xmm1 + movq rax, xmm2 + movq rax, xmm3 + movq rax, xmm0 + movq rax, xmm0 + + movq rax, xmm0 + movq rax, xmm1 + movq rax, xmm2 + movq rax, xmm3 + movq rax, xmm0 + movq rax, xmm3 + movq rax, xmm2 + movq rax, xmm1 + + movq rax, xmm0 + movq rax, xmm1 + movq rax, xmm2 + movq rax, xmm3 + movq rax, xmm0 + movq rax, xmm3 + movq rax, xmm2 + movq rax, xmm1 + + movq rax, xmm0 + movq rax, xmm1 + movq rax, xmm2 + movq rax, xmm3 + movq rax, xmm0 + movq rax, xmm3 + movq rax, xmm2 + movq rax, xmm1 + + dec rdi + jnz .L1 + ret + +;------------------------------------------------------------------------------ +; Name: Register8ToVector +; Purpose: Writes 8-bit main register values into 128-bit vector register +; without clearing the unused bits. +; Params: rdi = loops +;------------------------------------------------------------------------------ +Register8ToVector: +_Register8ToVector: + sal rdi, 2 ; Force some repetition. 
+.L1: + pinsrb xmm1, al, 0 + pinsrb xmm2, bl, 1 + pinsrb xmm3, cl, 2 + pinsrb xmm1, dl, 3 + pinsrb xmm2, sil, 4 + pinsrb xmm3, dil, 5 + pinsrb xmm0, bpl, 6 + pinsrb xmm0, spl, 7 + + pinsrb xmm0, al, 0 + pinsrb xmm1, bl, 1 + pinsrb xmm2, cl, 2 + pinsrb xmm3, dl, 3 + pinsrb xmm3, al, 4 + pinsrb xmm2, bl, 5 + pinsrb xmm1, bpl, 6 + pinsrb xmm0, spl, 7 + + pinsrb xmm1, r8b, 0 + pinsrb xmm2, r9b, 1 + pinsrb xmm3, r10b, 2 + pinsrb xmm1, r11b, 3 + pinsrb xmm2, r12b, 4 + pinsrb xmm3, al, 5 + pinsrb xmm0, cl, 6 + pinsrb xmm0, bl, 7 + + pinsrb xmm0, r8b, 0 + pinsrb xmm0, r9b, 1 + pinsrb xmm0, r10b, 2 + pinsrb xmm0, r11b, 3 + pinsrb xmm0, r12b, 4 + pinsrb xmm0, al, 5 + pinsrb xmm0, cl, 6 + pinsrb xmm0, bl, 7 + + pinsrb xmm1, al, 0 + pinsrb xmm2, bl, 1 + pinsrb xmm3, cl, 2 + pinsrb xmm1, dl, 3 + pinsrb xmm2, sil, 4 + pinsrb xmm3, dil, 5 + pinsrb xmm0, bpl, 6 + pinsrb xmm0, spl, 7 + + pinsrb xmm0, al, 10 + pinsrb xmm1, bl, 11 + pinsrb xmm2, cl, 12 + pinsrb xmm3, dl, 13 + pinsrb xmm3, dil, 14 + pinsrb xmm2, cl, 15 + pinsrb xmm1, al, 6 + pinsrb xmm0, bpl, 7 + + pinsrb xmm1, r8b, 10 + pinsrb xmm2, r9b, 11 + pinsrb xmm3, r10b, 12 + pinsrb xmm1, r11b, 13 + pinsrb xmm2, r12b, 14 + pinsrb xmm3, al, 15 + pinsrb xmm0, cl, 6 + pinsrb xmm0, bl, 7 + + pinsrb xmm0, r8b, 9 + pinsrb xmm0, r9b, 8 + pinsrb xmm0, r10b, 11 + pinsrb xmm0, r11b, 3 + pinsrb xmm0, r12b, 4 + pinsrb xmm0, al, 5 + pinsrb xmm0, cl, 6 + pinsrb xmm0, bl, 7 + + dec rdi + jnz .L1 + ret + +;------------------------------------------------------------------------------ +; Name: Register16ToVector +; Purpose: Writes 16-bit main register values into 128-bit vector register +; without clearing the unused bits. +; Params: rdi = loops +;------------------------------------------------------------------------------ +Register16ToVector: +_Register16ToVector: + sal rdi, 1 ; Force some repetition. 
+.L1: + pinsrw xmm1, ax, 0 + pinsrw xmm2, bx, 1 + pinsrw xmm3, cx, 2 + pinsrw xmm1, dx, 3 + pinsrw xmm2, si, 4 + pinsrw xmm3, di, 5 + pinsrw xmm0, bp, 6 + pinsrw xmm0, sp, 7 + + pinsrw xmm0, ax, 0 + pinsrw xmm1, bx, 1 + pinsrw xmm2, cx, 2 + pinsrw xmm3, dx, 3 + pinsrw xmm3, si, 4 + pinsrw xmm2, di, 5 + pinsrw xmm1, bp, 6 + pinsrw xmm0, sp, 7 + + pinsrw xmm1, r8w, 0 + pinsrw xmm2, r9w, 1 + pinsrw xmm3, r10w, 2 + pinsrw xmm1, r11w, 3 + pinsrw xmm2, r12w, 4 + pinsrw xmm3, ax, 5 + pinsrw xmm0, bp, 6 + pinsrw xmm0, bx, 7 + + pinsrw xmm0, r8w, 0 + pinsrw xmm0, r9w, 1 + pinsrw xmm0, r10w, 2 + pinsrw xmm0, r11w, 3 + pinsrw xmm0, r12w, 4 + pinsrw xmm0, ax, 5 + pinsrw xmm0, bp, 6 + pinsrw xmm0, bx, 7 + + pinsrw xmm1, ax, 0 + pinsrw xmm2, bx, 1 + pinsrw xmm3, cx, 2 + pinsrw xmm1, dx, 3 + pinsrw xmm2, si, 4 + pinsrw xmm3, di, 5 + pinsrw xmm0, bp, 6 + pinsrw xmm0, sp, 7 + + pinsrw xmm0, ax, 0 + pinsrw xmm1, bx, 1 + pinsrw xmm2, cx, 2 + pinsrw xmm3, dx, 3 + pinsrw xmm3, si, 4 + pinsrw xmm2, di, 5 + pinsrw xmm1, bp, 6 + pinsrw xmm0, sp, 7 + + pinsrw xmm1, r8w, 0 + pinsrw xmm2, r9w, 1 + pinsrw xmm3, r10w, 2 + pinsrw xmm1, r11w, 3 + pinsrw xmm2, r12w, 4 + pinsrw xmm3, ax, 5 + pinsrw xmm0, bp, 6 + pinsrw xmm0, bx, 7 + + pinsrw xmm0, r8w, 0 + pinsrw xmm0, r9w, 1 + pinsrw xmm0, r10w, 2 + pinsrw xmm0, r11w, 3 + pinsrw xmm0, r12w, 4 + pinsrw xmm0, ax, 5 + pinsrw xmm0, bp, 6 + pinsrw xmm0, bx, 7 + + dec rdi + jnz .L1 + ret + +;------------------------------------------------------------------------------ +; Name: Register32ToVector +; Purpose: Writes 32-bit main register values into 128-bit vector register +; without clearing the unused bits. +; Params: rdi = loops +;------------------------------------------------------------------------------ +Register32ToVector: +_Register32ToVector: +.L1: + pinsrd xmm1, eax, 0 ; Each xfer moves 4 bytes so to move 256 bytes + pinsrd xmm2, ebx, 1 ; we need 64 transfers. 
+ pinsrd xmm3, ecx, 2 + pinsrd xmm1, edx, 3 + pinsrd xmm2, esi, 0 + pinsrd xmm3, edi, 1 + pinsrd xmm0, ebp, 2 + pinsrd xmm0, esp, 3 + + pinsrd xmm0, eax, 0 + pinsrd xmm1, ebx, 1 + pinsrd xmm2, ecx, 2 + pinsrd xmm3, edx, 3 + pinsrd xmm3, esi, 3 + pinsrd xmm2, edi, 2 + pinsrd xmm1, ebp, 1 + pinsrd xmm0, esp, 0 + + pinsrd xmm1, r8d, 0 + pinsrd xmm2, r9d, 1 + pinsrd xmm3, r10d, 2 + pinsrd xmm1, r11d, 3 + pinsrd xmm2, r12d, 0 + pinsrd xmm3, eax, 1 + pinsrd xmm0, ebp, 2 + pinsrd xmm0, ebx, 3 + + pinsrd xmm0, r8d, 0 + pinsrd xmm0, r9d, 1 + pinsrd xmm0, r10d, 2 + pinsrd xmm0, r11d, 3 + pinsrd xmm0, r12d, 0 + pinsrd xmm0, eax, 0 + pinsrd xmm0, ebp, 0 + pinsrd xmm0, ebx, 0 + + pinsrd xmm1, eax, 0 + pinsrd xmm2, ebx, 1 + pinsrd xmm3, ecx, 2 + pinsrd xmm1, edx, 3 + pinsrd xmm2, esi, 0 + pinsrd xmm3, edi, 1 + pinsrd xmm0, ebp, 2 + pinsrd xmm0, esp, 3 + + pinsrd xmm0, eax, 0 + pinsrd xmm1, ebx, 1 + pinsrd xmm2, ecx, 2 + pinsrd xmm3, edx, 3 + pinsrd xmm3, esi, 3 + pinsrd xmm2, edi, 2 + pinsrd xmm1, ebp, 1 + pinsrd xmm0, esp, 0 + + pinsrd xmm1, r8d, 0 + pinsrd xmm2, r9d, 1 + pinsrd xmm3, r10d, 2 + pinsrd xmm1, r11d, 3 + pinsrd xmm2, r12d, 0 + pinsrd xmm3, eax, 1 + pinsrd xmm0, ebp, 2 + pinsrd xmm0, ebx, 3 + + pinsrd xmm0, r8d, 0 + pinsrd xmm0, r9d, 1 + pinsrd xmm0, r10d, 2 + pinsrd xmm0, r11d, 3 + pinsrd xmm0, r12d, 0 + pinsrd xmm0, eax, 0 + pinsrd xmm0, ebp, 0 + pinsrd xmm0, ebx, 0 + + dec rdi + jnz .L1 + ret + +;------------------------------------------------------------------------------ +; Name: Register64ToVector +; Purpose: Writes 64-bit main register values into 128-bit vector register +; without clearing the unused bits. +; Params: rdi = loops +;------------------------------------------------------------------------------ +Register64ToVector: +_Register64ToVector: + add rdi, rdi +.L1: + pinsrq xmm1, r8, 0 ; Each xfer moves 8 bytes, therefore to do + pinsrq xmm2, r9, 1 ; 256 bytes we need 32 transfers. 
+ pinsrq xmm3, r10, 0 + pinsrq xmm1, r11, 1 + pinsrq xmm2, r12, 0 + pinsrq xmm3, rax, 1 + pinsrq xmm0, rbp, 0 + pinsrq xmm0, rbx, 1 + + pinsrq xmm0, r8, 0 + pinsrq xmm0, r9, 1 + pinsrq xmm0, r10, 1 + pinsrq xmm0, r11, 1 + pinsrq xmm0, r12, 0 + pinsrq xmm0, rax, 0 + pinsrq xmm0, rbp, 0 + pinsrq xmm0, rbx, 0 + + dec rdi + jnz .L1 + ret + + +;------------------------------------------------------------------------------ +; Name: Vector8ToRegister +; Purpose: Writes 8-bit vector register values into main register. +; Params: rdi = loops +;------------------------------------------------------------------------------ +Vector8ToRegister: +_Vector8ToRegister: + sal rdi, 3 ; Force some repetition. +.L1: + pextrb rax, xmm1, 0 + pextrb rax, xmm2, 1 + pextrb rax, xmm3, 2 + pextrb rax, xmm1, 3 + pextrb rax, xmm2, 4 + pextrb rax, xmm3, 5 + pextrb rax, xmm0, 6 + pextrb rax, xmm0, 7 + + pextrb rax, xmm0, 0 + pextrb rax, xmm1, 1 + pextrb rax, xmm2, 2 + pextrb rax, xmm3, 3 + pextrb rax, xmm3, 4 + pextrb rax, xmm2, 5 + pextrb rax, xmm1, 6 + pextrb rax, xmm0, 7 + + pextrb rax, xmm1, 0 + pextrb rax, xmm2, 1 + pextrb rax, xmm3, 2 + pextrb rax, xmm1, 3 + pextrb rax, xmm2, 4 + pextrb rax, xmm3, 5 + pextrb rax, xmm0, 6 + pextrb rax, xmm0, 7 + + pextrb rax, xmm0, 0 + pextrb rax, xmm0, 1 + pextrb rax, xmm0, 2 + pextrb rax, xmm0, 3 + pextrb rax, xmm0, 4 + pextrb rax, xmm0, 5 + pextrb rax, xmm0, 6 + pextrb rax, xmm0, 7 + + dec rdi + jnz .L1 + ret + +;------------------------------------------------------------------------------ +; Name: Vector16ToRegister +; Purpose: Writes 16-bit vector register values into main register. +; Params: rdi = loops +;------------------------------------------------------------------------------ +Vector16ToRegister: +_Vector16ToRegister: + sal rdi, 2 ; Force some repetition. +.L1: + pextrw rax, xmm1, 0 ; 256 byte chunk / 2 bytes/xfer = 128 xfers. 
+ pextrw rax, xmm2, 1 + pextrw rax, xmm3, 2 + pextrw rax, xmm1, 3 + pextrw rax, xmm2, 4 + pextrw rax, xmm3, 5 + pextrw rax, xmm0, 6 + pextrw rax, xmm0, 7 + + pextrw rax, xmm0, 0 + pextrw rax, xmm1, 1 + pextrw rax, xmm2, 2 + pextrw rax, xmm3, 3 + pextrw rax, xmm3, 4 + pextrw rax, xmm2, 5 + pextrw rax, xmm1, 6 + pextrw rax, xmm0, 7 + + pextrw rax, xmm1, 0 + pextrw rax, xmm2, 1 + pextrw rax, xmm3, 2 + pextrw rax, xmm1, 3 + pextrw rax, xmm2, 4 + pextrw rax, xmm3, 5 + pextrw rax, xmm0, 6 + pextrw rax, xmm0, 7 + + pextrw rax, xmm0, 0 + pextrw rax, xmm0, 1 + pextrw rax, xmm0, 2 + pextrw rax, xmm0, 3 + pextrw rax, xmm0, 4 + pextrw rax, xmm0, 5 + pextrw rax, xmm0, 6 + pextrw rax, xmm0, 7 + + dec rdi + jnz .L1 + ret + +;------------------------------------------------------------------------------ +; Name: Vector32ToRegister +; Purpose: Writes 32-bit vector register values into main register. +; Params: rdi = loops +;------------------------------------------------------------------------------ +Vector32ToRegister: +_Vector32ToRegister: + add rdi, rdi +.L1: + pextrd eax, xmm1, 0 ; 256 byte chunk / 4 bytes/xfer = 64 xfers. 
+ pextrd eax, xmm2, 1 + pextrd eax, xmm3, 2 + pextrd eax, xmm1, 3 + pextrd eax, xmm2, 0 + pextrd eax, xmm3, 1 + pextrd eax, xmm0, 2 + pextrd eax, xmm0, 3 + + pextrd eax, xmm0, 0 + pextrd eax, xmm1, 1 + pextrd eax, xmm2, 2 + pextrd eax, xmm3, 3 + pextrd eax, xmm3, 3 + pextrd eax, xmm2, 2 + pextrd eax, xmm1, 1 + pextrd eax, xmm0, 0 + + pextrd eax, xmm1, 0 + pextrd eax, xmm2, 1 + pextrd eax, xmm3, 2 + pextrd eax, xmm1, 3 + pextrd eax, xmm2, 0 + pextrd eax, xmm3, 1 + pextrd eax, xmm0, 2 + pextrd eax, xmm0, 3 + + pextrd eax, xmm0, 0 + pextrd eax, xmm0, 1 + pextrd eax, xmm0, 2 + pextrd eax, xmm0, 3 + pextrd eax, xmm0, 0 + pextrd eax, xmm0, 0 + pextrd eax, xmm0, 0 + pextrd eax, xmm0, 0 + + dec rdi + jnz .L1 + ret + +;------------------------------------------------------------------------------ +; Name: Vector64ToRegister +; Purpose: Writes 64-bit vector register values into main register. +; Params: rdi = loops +;------------------------------------------------------------------------------ +Vector64ToRegister: +_Vector64ToRegister: + add rdi, rdi +.L1: + pextrq rax, xmm1, 0 ; 256 byte chunk / 8 bytes/xfer = 32 xfers. + pextrq rax, xmm2, 1 + pextrq rax, xmm3, 0 + pextrq rax, xmm1, 1 + pextrq rax, xmm2, 0 + pextrq rax, xmm3, 1 + pextrq rax, xmm0, 0 + pextrq rax, xmm0, 1 + + pextrq rax, xmm0, 0 + pextrq rax, xmm0, 1 + pextrq rax, xmm0, 1 + pextrq rax, xmm0, 1 + pextrq rax, xmm0, 0 + pextrq rax, xmm0, 0 + pextrq rax, xmm0, 0 + pextrq rax, xmm0, 0 + + dec rdi + jnz .L1 + ret + +;------------------------------------------------------------------------------ +; Name: CopySSE +; Purpose: Copies memory chunks that are 16-byte aligned. +; Params: rdi = ptr to destination memory area +; rsi = ptr to source memory area +; rdx = length in bytes +; rcx = loops +;------------------------------------------------------------------------------ +CopySSE: +_CopySSE: + push r10 + + shr rdx, 8 ; Ensure length is multiple of 256. + shl rdx, 8 + + ; Save our non-parameter XMM registers. 
+    sub rsp, 192        ; Twelve 16-byte slots: xmm4 .. xmm15.
+    movdqu [rsp], xmm4
+    movdqu [16+rsp], xmm5
+    movdqu [32+rsp], xmm6
+    movdqu [48+rsp], xmm7
+    movdqu [64+rsp], xmm8
+    movdqu [80+rsp], xmm9
+    movdqu [96+rsp], xmm10
+    movdqu [112+rsp], xmm11
+    movdqu [128+rsp], xmm12
+    movdqu [144+rsp], xmm13
+    movdqu [160+rsp], xmm14
+    movdqu [176+rsp], xmm15
+
+    test rdx, rdx       ; Length was rounded down to a multiple of 256 above.
+    jz .L3              ; If that left 0, skip the copy: the byte counter in
+                        ; .L2 would otherwise wrap around.
+
+.L1:
+    mov r10, rdx        ; r10 = bytes left in the current pass.
+
+.L2:
+    ; prefetchnta [rsi]
+    movdqa xmm0, [rsi]
+    movdqa xmm1, [16+rsi]
+    movdqa xmm2, [32+rsi]
+    movdqa xmm3, [48+rsi]
+    movdqa xmm4, [64+rsi]
+    movdqa xmm5, [80+rsi]
+    movdqa xmm6, [96+rsi]
+    movdqa xmm7, [112+rsi]
+    movdqa xmm8, [128+rsi]
+    movdqa xmm9, [144+rsi]
+    movdqa xmm10, [160+rsi]
+    movdqa xmm11, [176+rsi]
+    movdqa xmm12, [192+rsi]
+    movdqa xmm13, [208+rsi]
+    movdqa xmm14, [224+rsi]
+    movdqa xmm15, [240+rsi]
+
+    movntdq [rdi], xmm0
+    movntdq [16+rdi], xmm1
+    movntdq [32+rdi], xmm2
+    movntdq [48+rdi], xmm3
+    movntdq [64+rdi], xmm4
+    movntdq [80+rdi], xmm5
+    movntdq [96+rdi], xmm6
+    movntdq [112+rdi], xmm7
+    movntdq [128+rdi], xmm8
+    movntdq [144+rdi], xmm9
+    movntdq [160+rdi], xmm10
+    movntdq [176+rdi], xmm11
+    movntdq [192+rdi], xmm12
+    movntdq [208+rdi], xmm13
+    movntdq [224+rdi], xmm14
+    movntdq [240+rdi], xmm15
+
+    add rsi, 256
+    add rdi, 256
+
+    sub r10, 256
+    jnz .L2
+
+    sub rsi, rdx        ; rsi now points to start.
+    sub rdi, rdx        ; rdi now points to start.
+
+    dec rcx
+    jnz .L1
+
+.L3:
+    ; Restore exactly what was saved: xmm4 .. xmm15 from the same twelve
+    ; slots. Nothing beyond [176+rsp] belongs to the save area.
+    movdqu xmm4, [rsp]
+    movdqu xmm5, [16+rsp]
+    movdqu xmm6, [32+rsp]
+    movdqu xmm7, [48+rsp]
+    movdqu xmm8, [64+rsp]
+    movdqu xmm9, [80+rsp]
+    movdqu xmm10, [96+rsp]
+    movdqu xmm11, [112+rsp]
+    movdqu xmm12, [128+rsp]
+    movdqu xmm13, [144+rsp]
+    movdqu xmm14, [160+rsp]
+    movdqu xmm15, [176+rsp]
+    add rsp, 192
+
+    pop r10
+
+    ret
+
diff --git a/routinesARM.S b/routinesARM.S new file mode 100644 index 0000000..550c68e --- /dev/null +++ b/routinesARM.S
@@ -0,0 +1,644 @@ + +# ============================================================================ +# bandwidth 0.23, a benchmark to estimate memory transfer bandwidth. +# ARM assembly module. +# Copyright (C) 2010 by Zack T Smith. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +# +# The author may be reached at fbui@comcast.net. +# ============================================================================= + +#-------------- +# Version 0.7 +#-------------- + +#include "config.h" + +#ifdef CONFIG_CPU_ARM1136JS +.arch armv6k +.fpu softvfp +#elif CONFIG_CPU_CORTEXA9_HF +.arch armv7-a +.fpu neon +#elif CONFIG_CPU_CORTEXA9 +.arch armv7-a +.fpu softvfp +#endif + +#ifdef __thumb2__ +.syntax unified +.code 16 +#endif + +.section code + +.text +.align 2 + +.global Writer +.global RandomWriter + +.global Reader +.global RandomReader + +.global RegisterToRegister +.global StackReader +.global StackWriter + +#----------------------------------------------------------------------------- +# Name: Writer +# Purpose: Performs sequential write into memory, as fast as possible. 
+# Params:
+#	r0 = address
+#	r1 = length in bytes, rounded down to a multiple of 256
+#	     (must be at least 256 after rounding)
+#	r2 = count of passes over the buffer (must be at least 1)
+#	r3 = value to write
+#-----------------------------------------------------------------------------
+Writer:
+	stmfd	sp!,{r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
+
+# r4 = saved start address
+# r5 = saved length
+
+# Round the length down to a multiple of 256 by clearing the low 8 bits.
+# The inner loop subtracts 256 and stops only on exactly zero, so a
+# leftover 128 bytes would make it run past the end of the buffer.
+	bic	r1, r1, #0xff
+	mov	r4, r0
+	mov	r5, r1
+
+# Replicate the value so that each stmia stores eight words.
+	mov	r6, r3
+	mov	r7, r3
+	mov	r8, r3
+	mov	r9, r3
+	mov	r10, r3
+	mov	r11, r3
+	mov	r12, r3
+
+.L0:
+	mov	r0, r4
+	mov	r1, r5
+
+.L1:
+# Does 64 transfers, 4 bytes each = 256 bytes total.
+# The "stmia" instruction automatically increments r0.
+	.rept	8
+	stmia	r0!, { r3, r6, r7, r8, r9, r10, r11, r12 }
+	.endr
+
+	sub	r1, #256
+	cmp	r1, #0
+	bne	.L1
+
+	sub	r2, #1
+	cmp	r2, #0
+	bne	.L0
+
+# return.
+	ldmfd	sp!,{r4, r5, r6, r7, r8, r9, r10, r11, r12, pc}
+
+#-----------------------------------------------------------------------------
+# Name: 	Reader
+# Purpose:	Performs sequential reads from memory, as fast as possible.
+# Params:
+#	r0 = address
+#	r1 = length in bytes, rounded down to a multiple of 256
+#	     (must be at least 256 after rounding)
+#	r2 = count of passes over the buffer (must be at least 1)
+#-----------------------------------------------------------------------------
+Reader:
+	stmfd	sp!,{r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
+
+# r3 = temp
+# r4 = saved start address
+# r5 = saved length
+
+# Round the length down to a multiple of 256 by clearing the low 8 bits.
+# The inner loop subtracts 256 and stops only on exactly zero.
+	bic	r1, r1, #0xff
+	mov	r4, r0
+	mov	r5, r1
+
+.L2:
+	mov	r0, r4
+	mov	r1, r5
+
+.L3:
+# Does 64 transfers, 4 bytes each = 256 bytes total.
+# The "ldmia" instruction automatically increments r0.
+
+	.rept	8
+	ldmia	r0!, { r3, r6, r7, r8, r9, r10, r11, r12 }
+	.endr
+
+	sub	r1, #256
+	cmp	r1, #0
+	bne	.L3
+
+	sub	r2, #1
+	cmp	r2, #0
+	bne	.L2
+
+# return.
+	ldmfd	sp!,{r4, r5, r6, r7, r8, r9, r10, r11, r12, pc}
+
+#-----------------------------------------------------------------------------
+# Name: 	RandomWriter
+# Purpose:	Performs random write into memory, as fast as possible.
+# Params:
+# 	r0 = pointer to array of chunk pointers
+# 	r1 = # of 256-byte chunks (must be at least 1)
+# 	r2 = # loops to do (must be at least 1)
+# 	r3 = value to write
+#-----------------------------------------------------------------------------
+RandomWriter:
+	stmfd	sp!,{r4, r5, lr}
+
+# r4 = pointer to the current chunk
+# r5 = index of the current chunk
+
+.L4:
+	mov	r5, #0
+
+.L5:
+# Get pointer to chunk in memory.
+	ldr	r4, [r0, r5, LSL #2]
+
+# Does 64 transfers, 4 bytes each = 256 bytes total.
+# Every word offset from 0 to 252 is written once, in shuffled order.
+
+	str	r3, [r4, #160]
+	str	r3, [r4, #232]
+	str	r3, [r4, #224]
+	str	r3, [r4, #96]
+	str	r3, [r4, #164]
+	str	r3, [r4, #76]
+	str	r3, [r4, #100]
+	str	r3, [r4, #220]
+	str	r3, [r4, #248]
+	str	r3, [r4, #104]
+	str	r3, [r4, #4]
+	str	r3, [r4, #136]
+	str	r3, [r4, #112]
+	str	r3, [r4, #200]
+	str	r3, [r4, #12]
+	str	r3, [r4, #128]
+	str	r3, [r4, #148]
+	str	r3, [r4, #196]
+	str	r3, [r4, #216]
+	str	r3, [r4]
+	str	r3, [r4, #84]
+	str	r3, [r4, #140]
+	str	r3, [r4, #204]
+	str	r3, [r4, #184]
+	str	r3, [r4, #124]
+	str	r3, [r4, #48]
+	str	r3, [r4, #64]
+	str	r3, [r4, #212]
+	str	r3, [r4, #240]
+	str	r3, [r4, #236]
+	str	r3, [r4, #24]
+	str	r3, [r4, #252]
+	str	r3, [r4, #68]
+	str	r3, [r4, #20]
+	str	r3, [r4, #72]
+	str	r3, [r4, #32]
+	str	r3, [r4, #28]
+	str	r3, [r4, #52]
+	str	r3, [r4, #244]
+	str	r3, [r4, #180]
+	str	r3, [r4, #80]
+	str	r3, [r4, #60]
+	str	r3, [r4, #8]
+	str	r3, [r4, #56]
+	str	r3, [r4, #208]
+	str	r3, [r4, #228]
+	str	r3, [r4, #40]
+	str	r3, [r4, #172]
+	str	r3, [r4, #120]
+	str	r3, [r4, #176]
+	str	r3, [r4, #108]
+	str	r3, [r4, #132]
+	str	r3, [r4, #16]
+	str	r3, [r4, #44]
+	str	r3, [r4, #92]
+	str	r3, [r4, #168]
+	str	r3, [r4, #152]
+	str	r3, [r4, #156]
+	str	r3, [r4, #188]
+	str	r3, [r4, #36]
+	str	r3, [r4, #88]
+	str	r3, [r4, #116]
+	str	r3, [r4, #192]
+	str	r3, [r4, #144]
+
+	add	r5, #1
+	cmp	r5, r1
+	bne	.L5
+
+	sub	r2, #1
+	cmp	r2, #0
+	bne	.L4
+
+# return.
+	ldmfd	sp!,{r4, r5, pc}
+
+#-----------------------------------------------------------------------------
+# Name: 	RandomReader
+# Purpose:	Performs random reads from memory, as fast as possible.
+# Params:
+# 	r0 = pointer to array of chunk pointers
+# 	r1 = # of 256-byte chunks (must be at least 1)
+# 	r2 = # loops to do (must be at least 1)
+#-----------------------------------------------------------------------------
+RandomReader:
+	stmfd	sp!,{r4, r5, lr}
+
+# r3 = temp
+# r4 = pointer to the current chunk
+# r5 = index of the current chunk
+
+.L6:
+	mov	r5, #0
+
+.L7:
+# Get pointer to chunk in memory.
+	ldr	r4, [r0, r5, LSL #2]
+
+# Does 64 transfers, 4 bytes each = 256 bytes total.
+# Every word offset from 0 to 252 is read once, in shuffled order.
+
+	ldr	r3, [r4, #160]
+	ldr	r3, [r4, #232]
+	ldr	r3, [r4, #224]
+	ldr	r3, [r4, #96]
+	ldr	r3, [r4, #164]
+	ldr	r3, [r4, #76]
+	ldr	r3, [r4, #100]
+	ldr	r3, [r4, #220]
+	ldr	r3, [r4, #248]
+	ldr	r3, [r4, #104]
+	ldr	r3, [r4, #4]
+	ldr	r3, [r4, #136]
+	ldr	r3, [r4, #112]
+	ldr	r3, [r4, #200]
+	ldr	r3, [r4, #12]
+	ldr	r3, [r4, #128]
+	ldr	r3, [r4, #148]
+	ldr	r3, [r4, #196]
+	ldr	r3, [r4, #216]
+	ldr	r3, [r4]
+	ldr	r3, [r4, #84]
+	ldr	r3, [r4, #140]
+	ldr	r3, [r4, #204]
+	ldr	r3, [r4, #184]
+	ldr	r3, [r4, #124]
+	ldr	r3, [r4, #48]
+	ldr	r3, [r4, #64]
+	ldr	r3, [r4, #212]
+	ldr	r3, [r4, #240]
+	ldr	r3, [r4, #236]
+	ldr	r3, [r4, #24]
+	ldr	r3, [r4, #252]
+	ldr	r3, [r4, #68]
+	ldr	r3, [r4, #20]
+	ldr	r3, [r4, #72]
+	ldr	r3, [r4, #32]
+	ldr	r3, [r4, #28]
+	ldr	r3, [r4, #52]
+	ldr	r3, [r4, #244]
+	ldr	r3, [r4, #180]
+	ldr	r3, [r4, #80]
+	ldr	r3, [r4, #60]
+	ldr	r3, [r4, #8]
+	ldr	r3, [r4, #56]
+	ldr	r3, [r4, #208]
+	ldr	r3, [r4, #228]
+	ldr	r3, [r4, #40]
+	ldr	r3, [r4, #172]
+	ldr	r3, [r4, #120]
+	ldr	r3, [r4, #176]
+	ldr	r3, [r4, #108]
+	ldr	r3, [r4, #132]
+	ldr	r3, [r4, #16]
+	ldr	r3, [r4, #44]
+	ldr	r3, [r4, #92]
+	ldr	r3, [r4, #168]
+	ldr	r3, [r4, #152]
+	ldr	r3, [r4, #156]
+	ldr	r3, [r4, #188]
+	ldr	r3, [r4, #36]
+	ldr	r3, [r4, #88]
+	ldr	r3, [r4, #116]
+	ldr	r3, [r4, #192]
+	ldr	r3, [r4, #144]
+
+	add	r5, #1
+	cmp	r5, r1
+	bne	.L7
+
+	sub	r2, #1
+	cmp	r2, #0
+	bne	.L6
+
+# return.
+	ldmfd	sp!,{r4, r5, pc}
+
+#-----------------------------------------------------------------------------
+# Name: 	RegisterToRegister
+# Purpose:	Performs register-to-register transfers.
+# Params:
+#	r0 = count (must be at least 1)
+#-----------------------------------------------------------------------------
+RegisterToRegister:
+	stmfd	sp!,{lr}
+
+# r1 = temp
+# r2 = temp (destination of the second group)
+
+.L8:
+# Does 64 transfers, 4 bytes each = 256 bytes total.
+
+	mov	r1, r2
+	mov	r1, r3
+	mov	r1, r4
+	mov	r1, r5
+	mov	r1, r6
+	mov	r1, r7
+	mov	r1, r8
+	mov	r1, r9
+
+	mov	r2, r1
+	mov	r2, r3
+	mov	r2, r4
+	mov	r2, r5
+	mov	r2, r6
+	mov	r2, r7
+	mov	r2, r8
+	mov	r2, r9
+
+# The remaining six groups all target r1.
+	.rept	6
+	mov	r1, r2
+	mov	r1, r3
+	mov	r1, r4
+	mov	r1, r5
+	mov	r1, r6
+	mov	r1, r7
+	mov	r1, r8
+	mov	r1, r9
+	.endr
+
+	sub	r0, #1
+	cmp	r0, #0
+	bne	.L8
+
+# return.
+	ldmfd	sp!,{pc}
+
+#-----------------------------------------------------------------------------
+# Name: 	StackReader
+# Purpose:	Performs stack-to-register transfers.
+# Params:
+#	r0 = count (must be at least 1)
+#-----------------------------------------------------------------------------
+StackReader:
+	stmfd	sp!,{lr}
+
+# r1 = temp
+
+# Reserve a 32-byte scratch area; its contents are read as they are.
+	sub	sp, #32
+.L9:
+# Does 64 transfers, 4 bytes each = 256 bytes total.
+
+	.rept	8
+	ldr	r1, [sp]
+	ldr	r1, [sp, #4]
+	ldr	r1, [sp, #8]
+	ldr	r1, [sp, #12]
+	ldr	r1, [sp, #16]
+	ldr	r1, [sp, #20]
+	ldr	r1, [sp, #24]
+	ldr	r1, [sp, #28]
+	.endr
+
+	sub	r0, #1
+	cmp	r0, #0
+	bne	.L9
+
+	add	sp, #32
+
+# return.
+	ldmfd	sp!,{pc}
+
+#-----------------------------------------------------------------------------
+# Name: 	StackWriter
+# Purpose:	Performs register-to-stack transfers.
+# Params:
+#	r0 = count (must be at least 1)
+#-----------------------------------------------------------------------------
+StackWriter:
+	stmfd	sp!,{lr}
+
+# r1 = value stored; it is never set here, so whatever the caller left
+#      in r1 is what gets written.
+
+# Reserve a 32-byte scratch area to write into.
+	sub	sp, #32
+.L10:
+# Does 64 transfers, 4 bytes each = 256 bytes total.
+
+	.rept	8
+	str	r1, [sp]
+	str	r1, [sp, #4]
+	str	r1, [sp, #8]
+	str	r1, [sp, #12]
+	str	r1, [sp, #16]
+	str	r1, [sp, #20]
+	str	r1, [sp, #24]
+	str	r1, [sp, #28]
+	.endr
+
+	sub	r0, #1
+	cmp	r0, #0
+	bne	.L10
+
+	add	sp, #32
+
+# return.
+	ldmfd	sp!,{pc}
+
diff --git a/routinesARM.asm b/routinesARM.asm new file mode 100644 index 0000000..27d4230 --- /dev/null +++ b/routinesARM.asm
@@ -0,0 +1,629 @@ + +# ============================================================================ +# bandwidth 0.23, a benchmark to estimate memory transfer bandwidth. +# ARM assembly module. +# Copyright (C) 2010 by Zack T Smith. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +# +# The author may be reached at fbui@comcast.net. +# ============================================================================= + +#-------------- +# Version 0.7 +#-------------- + +.arch armv5t +.fpu softvfp + +.section code + +.text +.align 2 + +.global Writer +.global RandomWriter + +.global Reader +.global RandomReader + +.global RegisterToRegister +.global StackReader +.global StackWriter + +#----------------------------------------------------------------------------- +# Name: Writer +# Purpose: Performs sequential write into memory, as fast as possible. 
+# Params:
+#	r0 = address
+#	r1 = length in bytes, rounded down to a multiple of 256
+#	     (must be at least 256 after rounding)
+#	r2 = count of passes over the buffer (must be at least 1)
+#	r3 = value to write
+#-----------------------------------------------------------------------------
+Writer:
+	stmfd	sp!,{r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
+
+# r4 = saved start address
+# r5 = saved length
+
+# Round the length down to a multiple of 256 by clearing the low 8 bits.
+# The inner loop subtracts 256 and stops only on exactly zero, so a
+# leftover 128 bytes would make it run past the end of the buffer.
+	bic	r1, r1, #0xff
+	mov	r4, r0
+	mov	r5, r1
+
+# Replicate the value so that each stmia stores eight words.
+	mov	r6, r3
+	mov	r7, r3
+	mov	r8, r3
+	mov	r9, r3
+	mov	r10, r3
+	mov	r11, r3
+	mov	r12, r3
+
+.L0:
+	mov	r0, r4
+	mov	r1, r5
+
+.L1:
+# Does 64 transfers, 4 bytes each = 256 bytes total.
+# The "stmia" instruction automatically increments r0.
+	.rept	8
+	stmia	r0!, { r3, r6, r7, r8, r9, r10, r11, r12 }
+	.endr
+
+	sub	r1, #256
+	cmp	r1, #0
+	bne	.L1
+
+	sub	r2, #1
+	cmp	r2, #0
+	bne	.L0
+
+# return.
+	ldmfd	sp!,{r4, r5, r6, r7, r8, r9, r10, r11, r12, pc}
+
+#-----------------------------------------------------------------------------
+# Name: 	Reader
+# Purpose:	Performs sequential reads from memory, as fast as possible.
+# Params:
+#	r0 = address
+#	r1 = length in bytes, rounded down to a multiple of 256
+#	     (must be at least 256 after rounding)
+#	r2 = count of passes over the buffer (must be at least 1)
+#-----------------------------------------------------------------------------
+Reader:
+	stmfd	sp!,{r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
+
+# r3 = temp
+# r4 = saved start address
+# r5 = saved length
+
+# Round the length down to a multiple of 256 by clearing the low 8 bits.
+# The inner loop subtracts 256 and stops only on exactly zero.
+	bic	r1, r1, #0xff
+	mov	r4, r0
+	mov	r5, r1
+
+.L2:
+	mov	r0, r4
+	mov	r1, r5
+
+.L3:
+# Does 64 transfers, 4 bytes each = 256 bytes total.
+# The "ldmia" instruction automatically increments r0.
+
+	.rept	8
+	ldmia	r0!, { r3, r6, r7, r8, r9, r10, r11, r12 }
+	.endr
+
+	sub	r1, #256
+	cmp	r1, #0
+	bne	.L3
+
+	sub	r2, #1
+	cmp	r2, #0
+	bne	.L2
+
+# return.
+	ldmfd	sp!,{r4, r5, r6, r7, r8, r9, r10, r11, r12, pc}
+
+#-----------------------------------------------------------------------------
+# Name: 	RandomWriter
+# Purpose:	Performs random write into memory, as fast as possible.
+# Params:
+# 	r0 = pointer to array of chunk pointers
+# 	r1 = # of 256-byte chunks (must be at least 1)
+# 	r2 = # loops to do (must be at least 1)
+# 	r3 = value to write
+#-----------------------------------------------------------------------------
+RandomWriter:
+	stmfd	sp!,{r4, r5, lr}
+
+# r4 = pointer to the current chunk
+# r5 = index of the current chunk
+
+.L4:
+	mov	r5, #0
+
+.L5:
+# Get pointer to chunk in memory.
+	ldr	r4, [r0, r5, LSL #2]
+
+# Does 64 transfers, 4 bytes each = 256 bytes total.
+# Every word offset from 0 to 252 is written once, in shuffled order.
+
+	str	r3, [r4, #160]
+	str	r3, [r4, #232]
+	str	r3, [r4, #224]
+	str	r3, [r4, #96]
+	str	r3, [r4, #164]
+	str	r3, [r4, #76]
+	str	r3, [r4, #100]
+	str	r3, [r4, #220]
+	str	r3, [r4, #248]
+	str	r3, [r4, #104]
+	str	r3, [r4, #4]
+	str	r3, [r4, #136]
+	str	r3, [r4, #112]
+	str	r3, [r4, #200]
+	str	r3, [r4, #12]
+	str	r3, [r4, #128]
+	str	r3, [r4, #148]
+	str	r3, [r4, #196]
+	str	r3, [r4, #216]
+	str	r3, [r4]
+	str	r3, [r4, #84]
+	str	r3, [r4, #140]
+	str	r3, [r4, #204]
+	str	r3, [r4, #184]
+	str	r3, [r4, #124]
+	str	r3, [r4, #48]
+	str	r3, [r4, #64]
+	str	r3, [r4, #212]
+	str	r3, [r4, #240]
+	str	r3, [r4, #236]
+	str	r3, [r4, #24]
+	str	r3, [r4, #252]
+	str	r3, [r4, #68]
+	str	r3, [r4, #20]
+	str	r3, [r4, #72]
+	str	r3, [r4, #32]
+	str	r3, [r4, #28]
+	str	r3, [r4, #52]
+	str	r3, [r4, #244]
+	str	r3, [r4, #180]
+	str	r3, [r4, #80]
+	str	r3, [r4, #60]
+	str	r3, [r4, #8]
+	str	r3, [r4, #56]
+	str	r3, [r4, #208]
+	str	r3, [r4, #228]
+	str	r3, [r4, #40]
+	str	r3, [r4, #172]
+	str	r3, [r4, #120]
+	str	r3, [r4, #176]
+	str	r3, [r4, #108]
+	str	r3, [r4, #132]
+	str	r3, [r4, #16]
+	str	r3, [r4, #44]
+	str	r3, [r4, #92]
+	str	r3, [r4, #168]
+	str	r3, [r4, #152]
+	str	r3, [r4, #156]
+	str	r3, [r4, #188]
+	str	r3, [r4, #36]
+	str	r3, [r4, #88]
+	str	r3, [r4, #116]
+	str	r3, [r4, #192]
+	str	r3, [r4, #144]
+
+	add	r5, #1
+	cmp	r5, r1
+	bne	.L5
+
+	sub	r2, #1
+	cmp	r2, #0
+	bne	.L4
+
+# return.
+	ldmfd	sp!,{r4, r5, pc}
+
+#-----------------------------------------------------------------------------
+# Name: 	RandomReader
+# Purpose:	Performs random reads from memory, as fast as possible.
+# Params:
+# 	r0 = pointer to array of chunk pointers
+# 	r1 = # of 256-byte chunks (must be at least 1)
+# 	r2 = # loops to do (must be at least 1)
+#-----------------------------------------------------------------------------
+RandomReader:
+	stmfd	sp!,{r4, r5, lr}
+
+# r3 = temp
+# r4 = pointer to the current chunk
+# r5 = index of the current chunk
+
+.L6:
+	mov	r5, #0
+
+.L7:
+# Get pointer to chunk in memory.
+	ldr	r4, [r0, r5, LSL #2]
+
+# Does 64 transfers, 4 bytes each = 256 bytes total.
+# Every word offset from 0 to 252 is read once, in shuffled order.
+
+	ldr	r3, [r4, #160]
+	ldr	r3, [r4, #232]
+	ldr	r3, [r4, #224]
+	ldr	r3, [r4, #96]
+	ldr	r3, [r4, #164]
+	ldr	r3, [r4, #76]
+	ldr	r3, [r4, #100]
+	ldr	r3, [r4, #220]
+	ldr	r3, [r4, #248]
+	ldr	r3, [r4, #104]
+	ldr	r3, [r4, #4]
+	ldr	r3, [r4, #136]
+	ldr	r3, [r4, #112]
+	ldr	r3, [r4, #200]
+	ldr	r3, [r4, #12]
+	ldr	r3, [r4, #128]
+	ldr	r3, [r4, #148]
+	ldr	r3, [r4, #196]
+	ldr	r3, [r4, #216]
+	ldr	r3, [r4]
+	ldr	r3, [r4, #84]
+	ldr	r3, [r4, #140]
+	ldr	r3, [r4, #204]
+	ldr	r3, [r4, #184]
+	ldr	r3, [r4, #124]
+	ldr	r3, [r4, #48]
+	ldr	r3, [r4, #64]
+	ldr	r3, [r4, #212]
+	ldr	r3, [r4, #240]
+	ldr	r3, [r4, #236]
+	ldr	r3, [r4, #24]
+	ldr	r3, [r4, #252]
+	ldr	r3, [r4, #68]
+	ldr	r3, [r4, #20]
+	ldr	r3, [r4, #72]
+	ldr	r3, [r4, #32]
+	ldr	r3, [r4, #28]
+	ldr	r3, [r4, #52]
+	ldr	r3, [r4, #244]
+	ldr	r3, [r4, #180]
+	ldr	r3, [r4, #80]
+	ldr	r3, [r4, #60]
+	ldr	r3, [r4, #8]
+	ldr	r3, [r4, #56]
+	ldr	r3, [r4, #208]
+	ldr	r3, [r4, #228]
+	ldr	r3, [r4, #40]
+	ldr	r3, [r4, #172]
+	ldr	r3, [r4, #120]
+	ldr	r3, [r4, #176]
+	ldr	r3, [r4, #108]
+	ldr	r3, [r4, #132]
+	ldr	r3, [r4, #16]
+	ldr	r3, [r4, #44]
+	ldr	r3, [r4, #92]
+	ldr	r3, [r4, #168]
+	ldr	r3, [r4, #152]
+	ldr	r3, [r4, #156]
+	ldr	r3, [r4, #188]
+	ldr	r3, [r4, #36]
+	ldr	r3, [r4, #88]
+	ldr	r3, [r4, #116]
+	ldr	r3, [r4, #192]
+	ldr	r3, [r4, #144]
+
+	add	r5, #1
+	cmp	r5, r1
+	bne	.L7
+
+	sub	r2, #1
+	cmp	r2, #0
+	bne	.L6
+
+# return.
+	ldmfd	sp!,{r4, r5, pc}
+
+#-----------------------------------------------------------------------------
+# Name: 	RegisterToRegister
+# Purpose:	Performs register-to-register transfers.
+# Params:
+#	r0 = count (must be at least 1)
+#-----------------------------------------------------------------------------
+RegisterToRegister:
+	stmfd	sp!,{lr}
+
+# r1 = temp
+# r2 = temp (destination of the second group)
+
+.L8:
+# Does 64 transfers, 4 bytes each = 256 bytes total.
+
+	mov	r1, r2
+	mov	r1, r3
+	mov	r1, r4
+	mov	r1, r5
+	mov	r1, r6
+	mov	r1, r7
+	mov	r1, r8
+	mov	r1, r9
+
+	mov	r2, r1
+	mov	r2, r3
+	mov	r2, r4
+	mov	r2, r5
+	mov	r2, r6
+	mov	r2, r7
+	mov	r2, r8
+	mov	r2, r9
+
+# The remaining six groups all target r1.
+	.rept	6
+	mov	r1, r2
+	mov	r1, r3
+	mov	r1, r4
+	mov	r1, r5
+	mov	r1, r6
+	mov	r1, r7
+	mov	r1, r8
+	mov	r1, r9
+	.endr
+
+	sub	r0, #1
+	cmp	r0, #0
+	bne	.L8
+
+# return.
+	ldmfd	sp!,{pc}
+
+#-----------------------------------------------------------------------------
+# Name: 	StackReader
+# Purpose:	Performs stack-to-register transfers.
+# Params:
+#	r0 = count (must be at least 1)
+#-----------------------------------------------------------------------------
+StackReader:
+	stmfd	sp!,{lr}
+
+# r1 = temp
+
+# Reserve a 32-byte scratch area; its contents are read as they are.
+	sub	sp, #32
+.L9:
+# Does 64 transfers, 4 bytes each = 256 bytes total.
+
+	.rept	8
+	ldr	r1, [sp]
+	ldr	r1, [sp, #4]
+	ldr	r1, [sp, #8]
+	ldr	r1, [sp, #12]
+	ldr	r1, [sp, #16]
+	ldr	r1, [sp, #20]
+	ldr	r1, [sp, #24]
+	ldr	r1, [sp, #28]
+	.endr
+
+	sub	r0, #1
+	cmp	r0, #0
+	bne	.L9
+
+	add	sp, #32
+
+# return.
+	ldmfd	sp!,{pc}
+
+#-----------------------------------------------------------------------------
+# Name: 	StackWriter
+# Purpose:	Performs register-to-stack transfers.
+# Params:
+#	r0 = count (must be at least 1)
+#-----------------------------------------------------------------------------
+StackWriter:
+	stmfd	sp!,{lr}
+
+# r1 = value stored; it is never set here, so whatever the caller left
+#      in r1 is what gets written.
+
+# Reserve a 32-byte scratch area to write into.
+	sub	sp, #32
+.L10:
+# Does 64 transfers, 4 bytes each = 256 bytes total.
+
+	.rept	8
+	str	r1, [sp]
+	str	r1, [sp, #4]
+	str	r1, [sp, #8]
+	str	r1, [sp, #12]
+	str	r1, [sp, #16]
+	str	r1, [sp, #20]
+	str	r1, [sp, #24]
+	str	r1, [sp, #28]
+	.endr
+
+	sub	r0, #1
+	cmp	r0, #0
+	bne	.L10
+
+	add	sp, #32
+
+# return.
+	ldmfd	sp!,{pc}
+
diff --git a/routinesARM64.S b/routinesARM64.S new file mode 100644 index 0000000..fa1f8b7 --- /dev/null +++ b/routinesARM64.S
@@ -0,0 +1,483 @@
+
+# ============================================================================
+# bandwidth 0.23, a benchmark to estimate memory transfer bandwidth.
+# ARM assembly module.
+# Copyright (C) 2010 by Zack T Smith.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+#
+# The author may be reached at fbui@comcast.net.
+# =============================================================================
+
+#--------------
+# Version 0.7
+#--------------
+
+#include "config.h"
+
+.arch armv8-a
+
+#ifdef __thumb2__
+.syntax unified
+.code 16
+#endif
+
+.section code
+
+.text
+.align 2
+
+.global Writer
+.global RandomWriter
+
+.global Reader
+.global RandomReader
+
+.global RegisterToRegister
+.global StackReader
+.global StackWriter
+
+#-----------------------------------------------------------------------------
+# Name:     Writer
+# Purpose:  Performs sequential write into memory, as fast as possible (256 bytes per inner iteration).
+# Params:
+#   x0 = address of the buffer
+#   x1 = length in bytes; rounded down to a multiple of 256 and must be at least 256
+#   x2 = loop count (passes over the whole buffer); tested after each pass, so 0 is not valid
+#   x3 = value to write
+#-----------------------------------------------------------------------------
+Writer:
+    stp x29, x30, [sp, #-16]!
+# Round the length down to the 256 bytes consumed per inner iteration, so the cbnz exit test can reach zero.
+    bic x1, x1, #0xff
+    mov x4, x0
+    mov x5, x1
+
+    mov x6, x3
+
+# x4 = saved base address, x5 = saved (rounded) length; both are reloaded at the start of every pass
+# x6 = copy of x3, so each stp below stores two identical 8-byte words
+
+.L0:
+    mov x0, x4
+    mov x1, x5
+
+.L1:
+    stp x3, x6, [x0]
+    stp x3, x6, [x0, #16]
+    stp x3, x6, [x0, #32]
+    stp x3, x6, [x0, #48]
+    stp x3, x6, [x0, #64]
+    stp x3, x6, [x0, #80]
+    stp x3, x6, [x0, #96]
+    stp x3, x6, [x0, #112]
+    stp x3, x6, [x0, #128]
+    stp x3, x6, [x0, #144]
+    stp x3, x6, [x0, #160]
+    stp x3, x6, [x0, #176]
+    stp x3, x6, [x0, #192]
+    stp x3, x6, [x0, #208]
+    stp x3, x6, [x0, #224]
+    stp x3, x6, [x0, #240]
+    add x0, x0, #256
+
+    sub x1, x1, #256
+    cbnz x1, .L1
+
+    sub x2, x2, #1
+    cbnz x2, .L0
+
+# Pop the saved x29/x30 pair and return.
+    ldp x29, x30, [sp], #16
+    ret
+
+#-----------------------------------------------------------------------------
+# Name:     Reader
+# Purpose:  Performs sequential reads from memory, as fast as possible (256 bytes per inner iteration).
+# Params:
+#   x0 = address of the buffer
+#   x1 = length in bytes; rounded down to a multiple of 256 and must be at least 256
+#   x2 = loop count (passes over the whole buffer); tested after each pass, so 0 is not valid
+#-----------------------------------------------------------------------------
+Reader:
+    stp x29, x30, [sp, #-16]!
+    stp x20, x21, [sp, #-16]!
+    str x19, [sp, #-16]!
+    stp x16, x17, [sp, #-16]!
+    stp x14, x15, [sp, #-16]!
+    stp x12, x13, [sp, #-16]!
+    stp x10, x11, [sp, #-16]!
+    stp x8, x9, [sp, #-16]!
+# x19 is pushed alone in a 16-byte slot so that sp moves in 16-byte steps and x18 is never touched.
+    bic x1, x1, #0xff
+    mov x4, x0
+    mov x5, x1
+
+# x4 = saved base address, x5 = saved length (rounded down to the 256-byte loop stride above)
+# Load targets are x3, x6-x17 and x19-x21; x18, the AAPCS64 platform register, is avoided on purpose.
+
+.L2:
+    mov x0, x4
+    mov x1, x5
+
+.L3:
+    ldp x3, x6, [x0]
+    ldp x7, x8, [x0, #16]
+    ldp x9, x10, [x0, #32]
+    ldp x11, x12, [x0, #48]
+    ldp x13, x14, [x0, #64]
+    ldp x15, x16, [x0, #80]
+    ldp x17, x3, [x0, #96]
+    ldp x19, x20, [x0, #112]
+    ldp x21, x6, [x0, #128]
+    ldp x7, x8, [x0, #144]
+    ldp x9, x10, [x0, #160]
+    ldp x11, x12, [x0, #176]
+    ldp x13, x14, [x0, #192]
+    ldp x15, x16, [x0, #208]
+    ldp x17, x3, [x0, #224]
+    ldp x19, x20, [x0, #240]
+    add x0, x0, #256
+
+    sub x1, x1, #256
+    cbnz x1, .L3
+
+    sub x2, x2, #1
+    cbnz x2, .L2
+
+# Restore the saved registers in the reverse order of the pushes, then return.
+    ldp x8, x9, [sp], #16
+    ldp x10, x11, [sp], #16
+    ldp x12, x13, [sp], #16
+    ldp x14, x15, [sp], #16
+    ldp x16, x17, [sp], #16
+    ldr x19, [sp], #16
+    ldp x20, x21, [sp], #16
+    ldp x29, x30, [sp], #16
+    ret
+
+#-----------------------------------------------------------------------------
+# Name:     RandomWriter
+# Purpose:  Performs random write into memory, as fast as possible.
+# Params:
+#   x0 = pointer to array of chunk pointers (8 bytes per entry)
+#   x1 = number of 256-byte chunks; compared for equality after each chunk, so 0 is not valid
+#   x2 = number of loops to do; tested after each pass, so 0 is not valid
+#   x3 = value to write
+#-----------------------------------------------------------------------------
+RandomWriter:
+    stp x29, x30, [sp, #-16]!
+
+# x4 = pointer to the current chunk
+# x5 = index of the current chunk, reset to 0 at the start of every pass
+
+.L4:
+    mov x5, #0
+
+.L5:
+# Get pointer to chunk number x5. Note, 64-bit pointers, hence the LSL #3 index scaling.
+    ldr x4, [x0, x5, LSL #3]
+
+# Does 32 transfers, 8 bytes each = 256 bytes total; each 8-byte slot of the chunk is written once, in a fixed shuffled order.
+
+    str x3, [x4, #160]
+    str x3, [x4, #232]
+    str x3, [x4, #224]
+    str x3, [x4, #96]
+    str x3, [x4, #168]
+    str x3, [x4, #80]
+    str x3, [x4, #104]
+    str x3, [x4, #248]
+    str x3, [x4, #8]
+    str x3, [x4, #136]
+    str x3, [x4, #112]
+    str x3, [x4, #200]
+    str x3, [x4, #128]
+    str x3, [x4, #152]
+    str x3, [x4, #216]
+    str x3, [x4]
+    str x3, [x4, #88]
+    str x3, [x4, #144]
+    str x3, [x4, #208]
+    str x3, [x4, #184]
+    str x3, [x4, #48]
+    str x3, [x4, #64]
+    str x3, [x4, #240]
+    str x3, [x4, #24]
+    str x3, [x4, #72]
+    str x3, [x4, #32]
+    str x3, [x4, #56]
+    str x3, [x4, #16]
+    str x3, [x4, #40]
+    str x3, [x4, #176]
+    str x3, [x4, #120]
+    str x3, [x4, #192]
+
+    add x5, x5, #1
+    cmp x5, x1
+    bne .L5
+
+    sub x2, x2, #1
+    cbnz x2, .L4
+
+# Pop the saved x29/x30 pair and return.
+    ldp x29, x30, [sp], #16
+    ret
+
+#-----------------------------------------------------------------------------
+# Name: RandomReader
+# Purpose: Performs random reads from memory, as fast as possible.
+# Params:
+#   x0 = pointer to array of chunk pointers (8 bytes per entry)
+#   x1 = number of 256-byte chunks; compared for equality after each chunk, so 0 is not valid
+#   x2 = number of loops to do; tested after each pass, so 0 is not valid
+#-----------------------------------------------------------------------------
+RandomReader:
+    stp x29, x30, [sp, #-16]!
+
+# x4 = pointer to the current chunk; x3 receives every loaded value and is simply overwritten
+# x5 = index of the current chunk, reset to 0 at the start of every pass
+
+.L6:
+    mov x5, #0
+
+.L7:
+# Get pointer to chunk number x5. Note, 64-bit pointers, hence the LSL #3 index scaling.
+    ldr x4, [x0, x5, LSL #3]
+
+# Does 32 transfers, 8 bytes each = 256 bytes total; each 8-byte slot of the chunk is read once, in a fixed shuffled order.
+
+    ldr x3, [x4, #160]
+    ldr x3, [x4, #232]
+    ldr x3, [x4, #224]
+    ldr x3, [x4, #96]
+    ldr x3, [x4, #168]
+    ldr x3, [x4, #80]
+    ldr x3, [x4, #104]
+    ldr x3, [x4, #248]
+    ldr x3, [x4, #8]
+    ldr x3, [x4, #136]
+    ldr x3, [x4, #112]
+    ldr x3, [x4, #200]
+    ldr x3, [x4, #128]
+    ldr x3, [x4, #152]
+    ldr x3, [x4, #216]
+    ldr x3, [x4]
+    ldr x3, [x4, #88]
+    ldr x3, [x4, #144]
+    ldr x3, [x4, #208]
+    ldr x3, [x4, #184]
+    ldr x3, [x4, #48]
+    ldr x3, [x4, #64]
+    ldr x3, [x4, #240]
+    ldr x3, [x4, #24]
+    ldr x3, [x4, #72]
+    ldr x3, [x4, #32]
+    ldr x3, [x4, #56]
+    ldr x3, [x4, #16]
+    ldr x3, [x4, #40]
+    ldr x3, [x4, #176]
+    ldr x3, [x4, #120]
+    ldr x3, [x4, #192]
+
+    add x5, x5, #1
+    cmp x5, x1
+    bne .L7
+
+    sub x2, x2, #1
+    cbnz x2, .L6
+
+# Pop the saved x29/x30 pair and return.
+    ldp x29, x30, [sp], #16
+    ret
+
+#-----------------------------------------------------------------------------
+# Name:     RegisterToRegister
+# Purpose:  Performs register-to-register transfers (mov between general-purpose registers, no memory access in the loop).
+# Params:
+#   x0 = count of passes; tested after each pass, so 0 is not valid
+#-----------------------------------------------------------------------------
+RegisterToRegister:
+    stp x29, x30, [sp, #-16]!
+
+# x1 and x2 are the move destinations; x3-x9 are only read and are never initialised in this routine
+
+.L8:
+# Does 32 transfers, 8 bytes each = 256 bytes total.
+
+    mov x1, x2
+    mov x1, x3
+    mov x1, x4
+    mov x1, x5
+    mov x1, x6
+    mov x1, x7
+    mov x1, x8
+    mov x1, x9
+
+    mov x2, x1
+    mov x2, x3
+    mov x2, x4
+    mov x2, x5
+    mov x2, x6
+    mov x2, x7
+    mov x2, x8
+    mov x2, x9
+
+    mov x1, x2
+    mov x1, x3
+    mov x1, x4
+    mov x1, x5
+    mov x1, x6
+    mov x1, x7
+    mov x1, x8
+    mov x1, x9
+
+    mov x1, x2
+    mov x1, x3
+    mov x1, x4
+    mov x1, x5
+    mov x1, x6
+    mov x1, x7
+    mov x1, x8
+    mov x1, x9
+
+
+    sub x0, x0, #1
+    cbnz x0, .L8
+
+# Pop the saved x29/x30 pair and return.
+    ldp x29, x30, [sp], #16
+    ret
+
+#-----------------------------------------------------------------------------
+# Name:     StackReader
+# Purpose:  Performs stack-to-register transfers: loads x1 over and over from a 64-byte scratch area.
+# Params:
+#   x0 = count of 256-byte passes; tested after each pass, so 0 is not valid
+#-----------------------------------------------------------------------------
+StackReader:
+    stp x29, x30, [sp, #-16]!
+
+# x1 = load destination, overwritten by every load
+# Reserve 64 bytes of scratch stack: eight 8-byte slots, whose contents are never initialised here.
+    sub sp, sp, #64
+.L9:
+# Does 32 transfers (4 groups of the same 8 slots), 8 bytes each = 256 bytes total.
+
+    ldr x1, [sp]
+    ldr x1, [sp, #8]
+    ldr x1, [sp, #16]
+    ldr x1, [sp, #24]
+    ldr x1, [sp, #32]
+    ldr x1, [sp, #40]
+    ldr x1, [sp, #48]
+    ldr x1, [sp, #56]
+
+    ldr x1, [sp]
+    ldr x1, [sp, #8]
+    ldr x1, [sp, #16]
+    ldr x1, [sp, #24]
+    ldr x1, [sp, #32]
+    ldr x1, [sp, #40]
+    ldr x1, [sp, #48]
+    ldr x1, [sp, #56]
+
+    ldr x1, [sp]
+    ldr x1, [sp, #8]
+    ldr x1, [sp, #16]
+    ldr x1, [sp, #24]
+    ldr x1, [sp, #32]
+    ldr x1, [sp, #40]
+    ldr x1, [sp, #48]
+    ldr x1, [sp, #56]
+
+    ldr x1, [sp]
+    ldr x1, [sp, #8]
+    ldr x1, [sp, #16]
+    ldr x1, [sp, #24]
+    ldr x1, [sp, #32]
+    ldr x1, [sp, #40]
+    ldr x1, [sp, #48]
+    ldr x1, [sp, #56]
+
+    sub x0, x0, #1
+    cbnz x0, .L9
+# Release the 64-byte scratch area.
+    add sp, sp, #64
+
+# Pop the saved x29/x30 pair and return.
+    ldp x29, x30, [sp], #16
+    ret
+
+#-----------------------------------------------------------------------------
+# Name:     StackWriter
+# Purpose:  Performs register-to-stack transfers: stores x1 over and over into a 64-byte scratch area.
+# Params:
+#   x0 = count of 256-byte passes; tested after each pass, so 0 is not valid
+#-----------------------------------------------------------------------------
+StackWriter:
+    stp x29, x30, [sp, #-16]!
+
+# x1 = store source; it is never loaded in this routine, so the value written is whatever the caller left in it
+# Reserve 64 bytes of scratch stack: eight 8-byte slots at [sp] through [sp, #56].
+    sub sp, sp, #64
+.L10:
+# Does 32 transfers (4 groups of the same 8 slots), 8 bytes each = 256 bytes total.
+
+    str x1, [sp]
+    str x1, [sp, #8]
+    str x1, [sp, #16]
+    str x1, [sp, #24]
+    str x1, [sp, #32]
+    str x1, [sp, #40]
+    str x1, [sp, #48]
+    str x1, [sp, #56]
+
+    str x1, [sp]
+    str x1, [sp, #8]
+    str x1, [sp, #16]
+    str x1, [sp, #24]
+    str x1, [sp, #32]
+    str x1, [sp, #40]
+    str x1, [sp, #48]
+    str x1, [sp, #56]
+
+    str x1, [sp]
+    str x1, [sp, #8]
+    str x1, [sp, #16]
+    str x1, [sp, #24]
+    str x1, [sp, #32]
+    str x1, [sp, #40]
+    str x1, [sp, #48]
+    str x1, [sp, #56]
+
+    str x1, [sp]
+    str x1, [sp, #8]
+    str x1, [sp, #16]
+    str x1, [sp, #24]
+    str x1, [sp, #32]
+    str x1, [sp, #40]
+    str x1, [sp, #48]
+    str x1, [sp, #56]
+
+    sub x0, x0, #1
+    cbnz x0, .L10
+# Release the 64-byte scratch area.
+    add sp, sp, #64
+
+# Pop the saved x29/x30 pair and return.
+    ldp x29, x30, [sp], #16
+    ret
+