Project import generated by Copybara.

NOKEYCHECK=True
GitOrigin-RevId: dcbe0211d22b840a0543aea2f5415be7d645a689
diff --git a/BMP.c b/BMP.c
new file mode 100644
index 0000000..cadb718
--- /dev/null
+++ b/BMP.c
@@ -0,0 +1,1413 @@
+
+/*=============================================================================
+  bmplib, a simple library to create, modify, and write BMP image files.
+  Copyright (C) 2009-2010 by Zack T Smith.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License version 2 
+  as published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+  The author may be reached at fbui@comcast.net.
+ *============================================================================*/
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "BMP.h"
+
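+// Mini characters, 8 pixels high: 8 row strings per glyph, one glyph per ASCII code from '!' (33) through 'z'. '#' is a pixel in the text color, '.' a lightened pixel.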
+static char* mini_chars [] = 
+{
+	"#",
+	"#",
+	"#",
+	"#",
+	"#",
+	" ",
+	"#",
+	"",
+
+	"## ##",
+	" #  #",
+	"#  #",
+	"  ",
+	"  ",
+	"  ",
+	"  ",
+	"",
+
+	" # # ",
+	" # # ",
+	"#####",
+	" # # ",
+	"#####",
+	" # # ",
+	" # # ",
+	"",
+
+	"  #  ",
+	" ####",
+	"# #  ",
+	" ### ",
+	"  # #",
+	"####",
+	"  #  ",
+	"",
+
+	"##  #",
+	"    #",
+	"   #",
+	"  #",
+	" #",
+	"#",
+	"#  ##",
+	"",
+
+	" #   ",
+	"# #  ",
+	"##   ",
+	" ## #",
+	"# ## ",
+	"#  # ",
+	" ## #",
+	"",
+
+	"##",
+	" #",
+	"#",
+	"",
+	"",
+	"",
+	"",
+	"",
+
+	" #",
+	"#",
+	"#",
+	"#",
+	"#",
+	"#",
+	"#",
+	" #",
+
+	"# ",
+	" #",
+	" #",
+	" #",
+	" #",
+	" #",
+	" #",
+	"#",
+
+	"     ",
+	"# # #",
+	" ###",
+	"  #",
+	" ###",
+	"# # #",
+	"",
+	"",
+
+	"     ",
+	"  #",
+	"  #",
+	"#####",
+	"  #",
+	"  #",
+	"",
+	"",
+
+	"  ",
+	"",
+	"",
+	"",
+	"",
+	"##",
+	" #",
+	"#",
+
+	"     ",
+	"",
+	"",
+	"#####",
+	"",
+	"",
+	"",
+	"",
+
+	" ",
+	"",
+	"",
+	"",
+	"",
+	"",
+	"#",
+	"",
+
+	"    #",
+	"    #",
+	"   #",
+	"  #",
+	" #",
+	"#",
+	"#",
+	"",
+
+	" ## ",
+	"#  #",
+	"#  #",
+	"#  #",
+	"#  #",
+	"#  #",
+	" ## ",
+	"",
+
+	" #",
+	"##",
+	" #",
+	" #",
+	" #",
+	" #",
+	" #",
+	"",
+
+	" ## ",
+	"#  #",
+	"   #",
+	" ###",
+	"#   ",
+	"#   ",
+	"####",
+	"",
+
+	"####",
+	"   #",
+	"  # ",
+	" ## ",
+	"   #",
+	"#  #",
+	" ## ",
+	"",
+
+	"# # ",
+	"# #",
+	"# #",
+	"####",
+	"  #",
+	"  #",
+	"  #",
+	"",
+
+	"####",
+	"#   ",
+	"### ",
+	"   #",
+	"   #",
+	"#  #",
+	" ## ",
+	"",
+
+	" ## ",
+	"#   ",
+	"#   ",
+	"### ",
+	"#  #",
+	"#  #",
+	" ## ",
+	"",
+
+	"####",
+	"   #",
+	"   #",
+	"  # ",
+	"  # ",
+	"  # ",
+	"  # ",
+	"",
+
+	" ## ",
+	"#  #",
+	"#  #",
+	" ## ",
+	"#  #",
+	"#  #",
+	" ## ",
+	"",
+
+	" ## ",
+	"#  #",
+	"#  #",
+	" ###",
+	"   #",
+	"  # ",
+	" #  ",
+	"",
+
+	" ",
+	"",
+	"",
+	"#",
+	"",
+	"#",
+	"",
+	"",
+
+	"  ",
+	"",
+	"  ",
+	"##",
+	"  ",
+	"##",
+	" #",
+	"#",
+
+	"   #",
+	"  #",
+	" #",
+	"#",
+	" #",
+	"  #",
+	"   #",
+	"",
+
+	"     ",
+	"",
+	"",
+	"#####",
+	"  ",
+	"#####",
+	"",
+	"",
+
+	"#   ",
+	" #",
+	"  #",
+	"   #",
+	"  #",
+	" #",
+	"#",
+	"",
+
+	" ### ",
+	"#   #",
+	"    #",
+	"  ## ",
+	"  #",
+	"",
+	"  #",
+	"",
+
+	" ### ",
+	"#   #",
+	"# .##",
+	"# # #",
+	"# .##",
+	"#    ",
+	" ###",
+	"",
+
+	"  #  ",
+	" # # ",
+	"#   #",
+	"#   #",
+	"#####",
+	"#   #",
+	"#   #",
+	"",
+
+	"#### ",
+	"#   #",
+	"#   #",
+	"#### ",
+	"#   #",
+	"#   #",
+	"####",
+	"",
+
+	" ### ",
+	"#   #",
+	"#    ",
+	"#    ",
+	"#    ",
+	"#   #",
+	" ###",
+	"",
+
+	"#### ",
+	"#   #",
+	"#   #",
+	"#   #",
+	"#   #",
+	"#   #",
+	"####",
+	"",
+
+	"#####",
+	"#",
+	"#",
+	"###",
+	"#",
+	"#",
+	"#####",
+	"",
+
+	"#####",
+	"#    ",
+	"#    ",
+	"###",
+	"#    ",
+	"#    ",
+	"#",
+	"",
+
+	" ### ",
+	"#   #",
+	"#    ",
+	"#  ##",
+	"#   #",
+	"#   #",
+	" ###.",
+	"",
+
+	"#   #",
+	"#   #",
+	"#   #",
+	"#####",
+	"#   #",
+	"#   #",
+	"#   #",
+	"",
+
+	"###",
+	" #",
+	" #",
+	" #",
+	" #",
+	" #",
+	"###",
+	"",
+
+	"  ###",
+	"   #",
+	"   #",
+	"   #",
+	"   #",
+	"#  #",
+	" ##",
+	"",
+
+	"#   #",
+	"#  #",
+	"# #",
+	"##",
+	"# #",
+	"#  #",
+	"#   #",
+	"",
+	
+	"#    ",
+	"#",
+	"#",
+	"#",
+	"#",
+	"#",
+	"#####",
+	"",
+
+	"#   #",
+	"## ##",
+	"# # #",
+	"#   #",
+	"#   #",
+	"#   #",
+	"#   #",
+	"",
+	
+	"#   #",
+	"##  #",
+	"# # #",
+	"#  ##",
+	"#   #",
+	"#   #",
+	"#   #",
+	"",
+	
+	" ### ",
+	"#   #",
+	"#   #",
+	"#   #",
+	"#   #",
+	"#   #",
+	" ###",
+	"",
+
+	"#### ",
+	"#   #",
+	"#   #",
+	"#### ",
+	"#    ",
+	"#    ",
+	"#    ",
+	"",
+	
+	" ### ",
+	"#   #",
+	"#   #",
+	"#   #",
+	"# # #",
+	"#  # ",
+	" ## #",
+	"",
+
+	"#### ",
+	"#   #",
+	"#   #",
+	"#### ",
+	"# #  ",
+	"#  # ",
+	"#   #",
+	"",
+	
+	" ### ",
+	"#   #",
+	"#    ",
+	" ### ",
+	"    #",
+	"#   #",
+	" ###",
+	"",
+
+	"#####",
+	"  #",
+	"  #",
+	"  #",
+	"  #",
+	"  #",
+	"  #",
+	"",
+
+	"#   #",
+	"#   #",
+	"#   #",
+	"#   #",
+	"#   #",
+	"#   #",
+	" ###",
+	"",
+
+	"#   #",
+	"#   #",
+	"#   #",
+	"#   #",
+	"#   #",
+	" # # ",
+	"  #",
+	"",
+
+	"#   #",
+	"#   #",
+	"#   #",
+	"# . #",
+	"# # #",
+	"## ##",
+	"#   #",
+	"",
+
+	"#   #",
+	"#   #",
+	" # #",
+	"  #",
+	" # #",
+	"#   #",
+	"#   #",
+	"",
+
+	"#   #",
+	"#   #",
+	"#   #",
+	" # #",
+	"  #",
+	"  #",
+	"  #",
+	"",
+
+	"#####",
+	"    #",
+	"   #",
+	"  #",
+	" #",
+	"#",
+	"#####",
+	"",
+
+	"##",
+	"#",
+	"#",
+	"#",
+	"#",
+	"#",
+	"#",
+	"##",
+
+	"#    ",
+	"#",
+	" #",
+	"  #",
+	"   #",
+	"    #",
+	"    #",
+	"",
+
+	"##",
+	" #",
+	" #",
+	" #",
+	" #",
+	" #",
+	" #",
+	"##",
+
+	"  #  ",
+	" #.#",
+	"#   #",
+	"",
+	"",
+	"",
+	"",
+	"",
+
+	"    ",
+	"",
+	"",
+	"",
+	"",
+	"",
+	"",
+	"####",
+
+	"##",
+	"#",
+	" #",
+	"",
+	"",
+	"",
+	"",
+	"",
+
+	"    ",
+	"    ",
+	" ## ",
+	"   #",
+	" ###",
+	"#  #",
+	".###",
+	"",
+
+	"#   ",
+	"#   ",
+	"### ",
+	"#  #",
+	"#  #",
+	"#  #",
+	"### ",
+	"",
+
+	"    ",
+	"    ",
+	" ###",
+	"#   ",
+	"#   ",
+	"#   ",
+	" ###",
+	"",
+
+	"   #",
+	"   #",
+	" ###",
+	"#  #",
+	"#  #",
+	"#  #",
+	" ###",
+	"",
+
+	"    ",
+	"    ",
+	" ## ",
+	"#  #",
+	"####",
+	"#   ",
+	" ###",
+	"",
+
+	"  ##",
+	" #  ",
+	"### ",
+	" #  ",
+	" #  ",
+	" #  ",
+	"### ",
+	"",
+
+	"    ",
+	"    ",
+	" ###",
+	"#  #",
+	"#  #",
+	" ###",
+	"   #",
+	"### ",
+
+	"#   ",
+	"#   ",
+	"### ",
+	"#  #",
+	"#  #",
+	"#  #",
+	"#  #",
+	"",
+
+	" # ",
+	"   ",
+	"## ",
+	" # ",
+	" # ",
+	" # ",
+	"###",
+	"",
+
+	"  #",
+	"   ",
+	" ##",
+	"  #",
+	"  #",
+	"  #",
+	"  #",
+	"## ",
+
+	"#   ",
+	"#   ",
+	"#  #",
+	"# # ",
+	"##  ",
+	"# # ",
+	"#  #",
+	"",
+
+	"## ",
+	" # ",
+	" # ",
+	" # ",
+	" # ",
+	" # ",
+	"###",
+	"",
+
+	"     ",
+	"",
+	"####",
+	"# # #",
+	"# # #",
+	"# # #",
+	"# # #",
+	"",
+
+	"    ",
+	"    ",
+	"###",
+	"#  #",
+	"#  #",
+	"#  #",
+	"#  #",
+	"",
+
+	"    ",
+	"    ",
+	" ## ",
+	"#  #",
+	"#  #",
+	"#  #",
+	" ## ",
+	"",
+
+	"    ",
+	"",
+	"###",
+	"#  #",
+	"#  #",
+	"###",
+	"#",
+	"#",
+
+	"    ",
+	"",
+	" ###",
+	"#  #",
+	"#  #",
+	" ###",
+	"   #",
+	"   # ",
+
+	"    ",
+	"    ",
+	"# ##",
+	"## ",
+	"#  ",
+	"#  ",
+	"#  ",
+	"",
+
+	"    ",
+	"    ",
+	" ###",
+	"#  ",
+	" ##",
+	"   #",
+	"### ",
+	"",
+
+	" # ",
+	" #",
+	"###",
+	" #",
+	" #",
+	" #",
+	" ##",
+	"",
+
+	"    ",
+	"",
+	"#  #",
+	"#  #",
+	"#  #",
+	"#  #",
+	" ###",
+	"",
+
+	"    ",
+	"",
+	"#  #",
+	"#  #",
+	"#  #",
+	" # #",
+	"  #",
+	"",
+
+	"     ",
+	"",
+	"# # #",
+	"# # #",
+	"# # #",
+	"# # #",
+	" # #",
+	"",
+
+	"     ",
+	"",
+	"#   #",
+	" # #",
+	"  #",
+	" # #",
+	"#   #",
+	"",
+
+	"    ",
+	"    ",
+	"#  #",
+	"#  #",
+	"#  #",
+	" ###",
+	"   #",
+	"### ",
+
+	"     ",
+	"",
+	"#####",
+	"   #",
+	"  #",
+	" # ",
+	"#####",
+	"",
+
+
+};
+
+
+// Narrowest possible numbers: 7 row strings per glyph, for '0' through '9' followed by '.'.
+static char* narrow_nums [] = 
+{
+	" # ",
+	"# #",
+	"# #",
+	"# #",
+	"# #",
+	"# #",
+	" # ",
+
+	" #",
+	"##",
+	" #",
+	" #",
+	" #",
+	" #",
+	" #",
+
+	" # ",
+	"# #",
+	"  #",
+	" ##",
+	"#  ",
+	"#  ",
+	"###",
+
+	"###",
+	"  #",
+	" # ",
+	"## ",
+	"  #",
+	"# #",
+	" # ",
+
+	"# #",
+	"# #",
+	"# #",
+	"###",
+	"  #",
+	"  #",
+	"  #",
+
+	"###",
+	"#  ",
+	"## ",
+	"  #",
+	"  #",
+	"# #",
+	" # ",
+
+
+	" # ",
+	"#  ",
+	"#  ",
+	"## ",
+	"# #",
+	"# #",
+	" # ",
+
+	"###",
+	"  #",
+	"  #",
+	" # ",
+	" # ",
+	" # ",
+	" # ",
+
+	" # ",
+	"# #",
+	"# #",
+	" # ",
+	"# #",
+	"# #",
+	" # ",
+
+	" # ",
+	"# #",
+	"# #",
+	" ##",
+	"  #",
+	" # ",
+	"#  ",
+
+	" ",
+	"",
+	"",
+	" ",
+	"",
+	"",
+	"#",
+};
+
+
+/*===========================================================================
+ * Name:	BMP_new
+ * Purpose:	Creates new image with all pixels zeroed.
+ * Note:	Width and height are each rounded up to a multiple of 4.
+ * Returns:	The image (free it with BMP_delete), or NULL if a dimension
+ *		is < 1, the pixel count exceeds INT_MAX, or allocation fails.
+ */
+BMP* 
+BMP_new (int w, int h)
+{
+	BMP* nu;
+	if (w<1 || h<1)
+		return NULL;
+	if (w > INT_MAX - 3 || h > INT_MAX - 3)
+		return NULL;	// Rounding up below would overflow.
+	//----------
+
+	w = (w + 3) & ~3;
+	h = (h + 3) & ~3;
+	// Pixel indexes are computed in int, so w*h must fit in an int.
+	if (w > INT_MAX / h)
+		return NULL;
+
+	nu = (BMP*) calloc (1, sizeof (BMP));
+	if (!nu)
+		return NULL;
+	nu->width = w;
+	nu->height = h;
+	// calloc zero-fills the buffer, replacing the separate memset.
+	nu->pixels = (unsigned long*) calloc ((size_t) w * (size_t) h, sizeof (unsigned long));
+	if (!nu->pixels) { free (nu); return NULL; }
+	return nu;
+}
+
+/*===========================================================================
+ * Name:	BMP_delete
+ * Purpose:	Releases an image's pixel buffer and then the image itself.
+ * Note:	Passing NULL is harmless. The caller must not use the pointer
+ *		afterwards.
+ */
+void 
+BMP_delete (BMP* bmp)
+{
+	if (bmp != NULL) {
+		// free() ignores a NULL pointer, so pixels needs no separate test.
+		free (bmp->pixels);
+		free (bmp);
+	}
+}
+
+/*===========================================================================
+ * Name:	BMP_point
+ * Purpose:	Stores one pixel value in the image.
+ * Note:	Silently does nothing if bmp is NULL, the image has no pixel
+ *		buffer, or (x,y) lies outside the image.
+ */
+void
+BMP_point (BMP *bmp, int x, int y, unsigned long rgb)
+{
+	int inside;
+
+	if (!bmp)
+		return;
+	inside = x >= 0 && y >= 0 && x < bmp->width && y < bmp->height;
+	if (inside && bmp->pixels)
+		bmp->pixels[y*bmp->width + x] = rgb;	// Rows are stored one after another.
+}
+
+/*===========================================================================
+ * Name:	BMP_line
+ * Purpose:	Draws a straight line from (x0,y0) to (x1,y1), both ends included.
+ */
+void
+BMP_line (BMP *bmp, int x0, int y0, int x1, int y1, unsigned long rgb)
+{
+	if ((rgb >> 24) == 0xff)	// e.g. 0xffRRGGBB: such a value draws nothing.
+		return;
+	// Single points and axis-aligned lines are handed to the simpler routines.
+	if (x0 == x1 && y0 == y1) 
+		BMP_point (bmp, x0, y0, rgb);
+	else if (x0 == x1)
+		BMP_vline (bmp, x0, y0, y1, rgb);
+	else if (y0 == y1)
+		BMP_hline (bmp, x0, x1, y0, rgb);
+	else {
+		int j, x, y, dx, dy, e, xchange, s1, s2;
+
+		// DDA, copied from my FramebufferUI project.
+		// (x,y) is the next pixel to plot; s1 and s2 are the x and y step signs.
+		x = x0;
+		y = y0;
+		s1 = 1;
+		s2 = 1;
+
+		dx = x1 - x0;
+		if (dx < 0) {
+			dx = -dx;
+			s1 = -1;
+		}
+
+		dy = y1 - y0;
+		if (dy < 0) {
+			dy = -dy;
+			s2 = -1;
+		}
+
+		xchange = 0;	// Becomes 1 when y is the longer (major) axis.
+		// Swap so that dx always holds the longer extent and dy the shorter.
+		if (dy > dx) {
+			int tmp = dx;
+			dx = dy;
+			dy = tmp;
+			xchange = 1;
+		}
+
+		e = (dy<<1) - dx;	// Error term; >= 0 means the minor axis steps too.
+		j = 0;
+		// Plot dx+1 pixels, one for each position along the major axis.
+		while (j <= dx) {
+			j++;
+
+			BMP_point (bmp, x, y, rgb);
+
+			if (e >= 0) {	// Step along the minor axis.
+				if (xchange)
+					x += s1;
+				else
+					y += s2;
+				e -= (dx << 1);
+			}
+			if (xchange) 	// Always step along the major axis.
+				y += s2;
+			else
+				x += s1;
+			e += (dy << 1);
+		}
+	}
+}
+
+/*===========================================================================
+ * Name:	BMP_rect
+ * Purpose:	Draws a 1-pixel rectangle outline; nothing if w or h is < 1.
+ */
+void
+BMP_rect (BMP *bmp, int x, int y, int w, int h, unsigned long rgb)
+{
+	if (w < 1 || h < 1)
+		return;	// hline/vline swap reversed endpoints, so an empty rect would draw.
+	BMP_hline (bmp, x, x+w-1, y, rgb);  BMP_hline (bmp, x, x+w-1, y+h-1, rgb);
+	BMP_vline (bmp, x, y, y+h-1, rgb);  BMP_vline (bmp, x+w-1, y, y+h-1, rgb);
+}
+
+/*===========================================================================
+ * Name:	BMP_fillrect
+ * Purpose:	Fills a w x h rectangle whose top-left is (x,y) with a color.
+ * Note:	Draws nothing unless w and h are both at least 1.
+ */
+void
+BMP_fillrect (BMP *bmp, int x, int y, int w, int h, unsigned long rgb)
+{
+	if (w < 1)
+		return;	// BMP_hline swaps reversed endpoints, so w < 1 would still draw.
+	for (; h > 0; h--, y++)
+		BMP_hline (bmp, x, x+w-1, y, rgb);
+}
+
+/*===========================================================================
+ * Name:	BMP_clear
+ * Purpose:	Sets all pixels to specified color; does nothing if bmp is NULL.
+ */
+void
+BMP_clear (BMP *bmp, unsigned long rgb)
+{
+	if (bmp) BMP_fillrect (bmp, 0, 0, bmp->width, bmp->height, rgb);
+}
+
+/*===========================================================================
+ * Name:	BMP_hline
+ * Purpose:	Draws horizontal line from x0 to x1 (in either order) at row y.
+ * Note:	Parts of the line outside the image are not drawn.
+ */
+void
+BMP_hline (BMP *bmp, int x0, int x1, int y, unsigned long rgb)
+{
+	if (!bmp || y < 0 || y >= bmp->height)
+		return;
+	if (x0 > x1) { int tmp=x1; x1=x0; x0=tmp; }
+	// Clip first so the loop stays within the image and x0++ cannot overflow.
+	if (x0 < 0) x0 = 0;
+	if (x1 >= bmp->width) x1 = bmp->width - 1;
+	while (x0 <= x1)
+		BMP_point (bmp, x0++, y, rgb);
+}
+
+/*===========================================================================
+ * Name:	BMP_vline
+ * Purpose:	Draws vertical line from y0 to y1 (in either order) at column x.
+ * Note:	Parts of the line outside the image are not drawn.
+ */
+void
+BMP_vline (BMP *bmp, int x, int y0, int y1, unsigned long rgb)
+{
+	if (!bmp || x < 0 || x >= bmp->width)
+		return;
+	if (y0 > y1) { int tmp=y1; y1=y0; y0=tmp; }
+	// Clip first so the loop stays within the image and y0++ cannot overflow.
+	if (y0 < 0) y0 = 0;
+	if (y1 >= bmp->height) y1 = bmp->height - 1;
+	while (y0 <= y1)
+		BMP_point (bmp, x, y0++, rgb);
+}
+
+/*===========================================================================
+ * Name:	BMP_draw_mini_string
+ * Purpose:	Draws text in the miniature font with its top-left at (x,y).
+ * Note:	Only '!' through 'z' have glyphs. A space advances x by 5;
+ *		any other character is skipped without advancing.
+ *		'#' cells are drawn in color, '.' cells in a lighter shade.
+ * Returns:	The x position following the last glyph, or 0 when nothing
+ *		is attempted (NULL argument, empty string, or a start point
+ *		at or past the right or bottom edge of the image).
+ */
+int
+BMP_draw_mini_string (BMP *bmp, char *string, int x, int y, unsigned long color)
+{
+	unsigned long red, green, blue;
+	unsigned long light;
+	char *p;
+
+	if (!bmp || !string)
+		return 0;
+	if (!*string || x >= bmp->width || y >= bmp->height)
+		return 0;
+	//----------
+
+	// light = each 8-bit channel of color averaged 1:3 with 0xff.
+	red = ((0xff & (color >> 16)) + 3*0xff) / 4;
+	green = ((0xff & (color >> 8)) + 3*0xff) / 4;
+	blue = ((0xff & color) + 3*0xff) / 4;
+	light = (red << 16) | (green << 8) | blue;
+
+	// Rows per glyph; BMP_mini_string_width relies on this macro too.
+#define MINI_HEIGHT (8)
+	for (p = string; *p; p++) {
+		char ch = *p;
+		char **glyph;
+		int row;
+		int width;
+
+		if (ch == ' ') {
+			x += 5;
+			continue;
+		}
+		if (ch < '!' || ch > 'z')
+			continue;	// No glyph for this character.
+
+		// The table starts at '!' (33) with MINI_HEIGHT rows per glyph;
+		// the length of a glyph's first row is taken as its width.
+		glyph = &mini_chars[MINI_HEIGHT * (ch - 33)];
+		width = strlen (glyph[0]);
+
+		for (row = 0; row < MINI_HEIGHT; row++) {
+			char *cell = glyph[row];
+			int col;
+
+			for (col = 0; cell[col]; col++) {
+				if (cell[col] == '#') {
+					BMP_point (bmp, x+col, y+row, color);
+				} else if (cell[col] == '.') {
+					BMP_point (bmp, x+col, y+row, light);
+				}
+			}
+		}
+
+		// Leave a one-pixel gap before the next glyph.
+		x += width + 1;
+	}
+
+	return x;
+}
+
+/*===========================================================================
+ * Name:	BMP_mini_string_width
+ * Purpose:	Measures how many pixels wide text is in the miniature font.
+ * Note:	A space counts 5 and each of '!' through 'z' counts its glyph
+ *		width plus 1; every other character counts nothing.
+ *		These are the same advances BMP_draw_mini_string applies to x.
+ * Returns:	The summed width, or 0 if string is NULL.
+ */
+int
+BMP_mini_string_width (char *string)
+{
+	int total = 0;
+	char *p;
+
+	if (!string)
+		return 0;
+	//----------
+
+	for (p = string; *p; p++) {
+		char ch = *p;
+		int w;
+
+		if (ch == ' ') {
+			total += 5;
+			continue;
+		}
+		if (ch < '!' || ch > 'z')
+			continue;	// No glyph, no width.
+
+		// A glyph's width is the length of its first row string.
+		w = strlen (mini_chars[MINI_HEIGHT * (ch - 33)]);
+		total += w + 1;
+	}
+
+	return total;
+}
+
+/*===========================================================================
+ * Name:	BMP_draw_narrow_numbers
+ * Purpose:	Draws digits and '.' in the narrow font, top-left at (x,y).
+ * Note:	A space advances x by 3; characters other than '0'-'9', '.'
+ *		and space are skipped without advancing.
+ * Returns:	The x position following the last glyph, or 0 when nothing
+ *		is attempted (NULL argument, empty string, or a start point
+ *		at or past the right or bottom edge of the image).
+ */
+int
+BMP_draw_narrow_numbers (BMP *bmp, char *string, int x, int y, unsigned long color)
+{
+	char *p;
+
+	if (!bmp || !string)
+		return 0;
+	if (!*string || x >= bmp->width || y >= bmp->height)
+		return 0;
+	//----------
+
+#define NARROW_HEIGHT (7)
+	for (p = string; *p; p++) {
+		char ch = *p;
+		char **glyph;
+		int glyph_no, row, width;
+
+		if (ch == ' ') {
+			x += 3;
+			continue;
+		}
+		// Table order is '0' through '9' followed by '.'.
+		if (ch >= '0' && ch <= '9')
+			glyph_no = ch - '0';
+		else if (ch == '.')
+			glyph_no = 10;
+		else
+			continue;	// Not drawable in this font.
+
+		glyph = &narrow_nums[NARROW_HEIGHT * glyph_no];
+		width = strlen (glyph[0]);	// First row gives the glyph width.
+
+		for (row = 0; row < NARROW_HEIGHT; row++) {
+			char *cell = glyph[row];
+			int col;
+
+			for (col = 0; cell[col]; col++)
+				if (cell[col] == '#')
+					BMP_point (bmp, x+col, y+row, color);
+		}
+		x += width + 1;
+	}
+
+	return x;
+}
+
+/*===========================================================================
+ * Name:	BMP_getpixel
+ * Purpose:	Fetches the value stored for one pixel of the image.
+ * Returns:	The pixel value, or 0 if bmp is NULL, the image has no pixel
+ *		buffer, or (x,y) lies outside the image.
+ */
+unsigned long
+BMP_getpixel (BMP *bmp, int x, int y)
+{
+	unsigned long value = 0;
+
+	if (bmp && bmp->pixels
+	    && x >= 0 && x < bmp->width
+	    && y >= 0 && y < bmp->height)
+		value = bmp->pixels[y*bmp->width + x];
+	return value;
+}
+
+// Stores v at p as 4 bytes, least significant byte first.
+static void
+BMP_put_le32 (unsigned char *p, unsigned long v)
+{
+	int k;
+	for (k = 0; k < 4; k++)
+		p[k] = (v >> (8*k)) & 0xff;
+}
+
+/*===========================================================================
+ * Name:	BMP_write
+ * Purpose:	Writes image to an uncompressed 24-bit BMP file.
+ * Note:	Bits 0-7, 8-15 and 16-23 of each pixel are written in that
+ *		order; higher bits are dropped. Rows are padded to 4 bytes.
+ * Returns:	1 on success; 0 if the image is empty or the file cannot be
+ *		created or fully written; -1 if bmp or path is NULL.
+ */
+int 
+BMP_write (BMP* bmp, char *path)
+{
+	FILE *f;
+#define HDRLEN (54)
+	unsigned char h[HDRLEN];
+	unsigned char padding[3] = { 0, 0, 0 };
+	unsigned long rowlen, pad, imglen;
+	int i, j;
+
+	if (!bmp || !path)
+		return -1;
+	if (!bmp->pixels || bmp->width < 1 || bmp->height < 1)
+		return 0;
+	//----------
+
+	// Bytes per row, padding up to a multiple of 4, and total pixel bytes.
+	rowlen = 3UL * (unsigned long) bmp->width;
+	pad = (4 - (rowlen & 3)) & 3;
+	imglen = (rowlen + pad) * (unsigned long) bmp->height;
+
+	// Prepare header: 14-byte file header followed by 40-byte info header.
+	memset (h, 0, HDRLEN);
+	h[0] = 'B';
+	h[1] = 'M';
+	BMP_put_le32 (h + 2, HDRLEN + imglen);	// Total file size.
+	h[10] = HDRLEN;				// Offset of the pixel data.
+	h[14] = 40;				// Info header size.
+	BMP_put_le32 (h + 18, (unsigned long) bmp->width);
+	BMP_put_le32 (h + 22, (unsigned long) bmp->height);
+	h[26] = 1;				// Color planes.
+	h[28] = 24;				// Bits per pixel.
+	BMP_put_le32 (h + 34, imglen);		// Size of the pixel data.
+	BMP_put_le32 (h + 38, 2835);		// Horizontal pixels/meter.
+	BMP_put_le32 (h + 42, 2835);		// Vertical pixels/meter.
+
+	// Create the file and write the header.
+	f = fopen (path, "wb");
+	if (!f)
+		return 0;
+	if (HDRLEN != fwrite (h, 1, HDRLEN, f))
+		goto fail;
+
+	// Write pixels.
+	// Note that BMP has lower rows first.
+	for (j=bmp->height-1; j >= 0; j--) {
+		unsigned long *row = bmp->pixels + (size_t) j * (size_t) bmp->width;
+		for (i=0; i < bmp->width; i++) {
+			unsigned char rgb[3];
+			rgb[0] = row[i] & 0xff;
+			rgb[1] = (row[i] >> 8) & 0xff;
+			rgb[2] = (row[i] >> 16) & 0xff;
+			if (3 != fwrite (rgb, 1, 3, f))
+				goto fail;
+		}
+		if (pad != fwrite (padding, 1, pad, f))
+			goto fail;
+	}
+
+	// fclose flushes buffered output, so its failure is a write failure.
+	return fclose (f) == 0 ? 1 : 0;
+fail:
+	fclose (f);
+	return 0;
+}
+
+
diff --git a/BMP.h b/BMP.h
new file mode 100644
index 0000000..a86858a
--- /dev/null
+++ b/BMP.h
@@ -0,0 +1,66 @@
+
+/*=============================================================================
+  bmplib, a simple library to create, modify, and write BMP image files.
+  Copyright (C) 2009 by Zack T Smith.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License version 2 
+  as published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+  The author may be reached at fbui@comcast.net.
+ *============================================================================*/
+
+#ifndef _BMP_H
+#define _BMP_H
+
+typedef struct {
+	int width, height;	// Image size in pixels.
+	unsigned long *pixels;	// width*height values; pixel (x,y) is pixels[y*width + x].
+} BMP;
+
+#define MINIFONT_HEIGHT (8)
+
+extern BMP* BMP_new (int, int);	// Width, height; NULL on failure.
+extern void BMP_delete (BMP*);
+extern void BMP_clear (BMP*, unsigned long);
+extern int BMP_write (BMP*, char *path);	// 1 = written, 0 = failed, -1 = NULL argument.
+extern void BMP_point (BMP*, int, int, unsigned long);
+extern void BMP_line (BMP *, int x0, int y0, int x1, int y1, unsigned long);
+extern void BMP_hline (BMP *, int x0, int x1, int y, unsigned long);
+extern void BMP_vline (BMP *, int x, int y0, int y1, unsigned long);
+extern void BMP_rect (BMP *, int x, int y, int w, int h, unsigned long);
+extern void BMP_fillrect (BMP *, int x, int y, int w, int h, unsigned long);
+extern unsigned long BMP_getpixel (BMP*, int, int);	// 0 if outside the image.
+extern int BMP_draw_mini_string (BMP *, char *, int x, int y, unsigned long);
+extern int BMP_mini_string_width (char *);
+extern int BMP_draw_narrow_numbers (BMP *, char *, int x, int y, unsigned long);
+#define RGB_BLACK (0)	// Colors are 0xRRGGBB: red in bits 16-23, green in 8-15, blue in 0-7.
+#define RGB_GRAY (0xc0c0c0)
+#define RGB_RED (0xff0000)
+#define RGB_MAGENTA (0xff00ff)
+#define RGB_GREEN (0xff00)
+#define RGB_DARKGREEN (0x6400)
+#define RGB_BLUE (0xff)
+#define RGB_WHITE (0xffffff)
+#define RGB_YELLOW (0xffff00)
+#define RGB_CYAN (0xffff)
+#define RGB_NAVYBLUE (0x80)
+#define RGB_ORANGE (0xffa500)
+#define RGB_DARKORANGE (0xff8c00)
+#define RGB_PURPLE (0xa020f0)
+#define RGB_MAROON (0x800000)
+#define RGB_SALMON (0xfa8072)
+#define RGB_BRASS (0xc3a368)
+#define RGB_LEMONYELLOW (0xfde910)
+
+#endif
+
diff --git a/COPYING.txt b/COPYING.txt
new file mode 100644
index 0000000..3912109
--- /dev/null
+++ b/COPYING.txt
@@ -0,0 +1,340 @@
+		    GNU GENERAL PUBLIC LICENSE
+		       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+                       51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+			    Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+		    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+			    NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+		     END OF TERMS AND CONDITIONS
+
+	    How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) year name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Library General
+Public License instead of this License.
diff --git a/README.txt b/README.txt
new file mode 100644
index 0000000..5acb2ff
--- /dev/null
+++ b/README.txt
@@ -0,0 +1,145 @@
+
+This is the Readme file for my program
+called "bandwidth".
+
+Bandwidth is a benchmark that attempts to measure
+primarily memory bandwidth. In December 2010 and
+as of release 0.24a, I have extended 'bandwidth'
+to measure network bandwidth as well.
+
+It's useful because hardware specifications are 
+sometimes incomplete or misleading.
+
+--------------------------------------------------
+MEMORY BANDWIDTH 
+
+Bandwidth performs sequential and random
+reads and writes of varying sizes. This permits 
+you to see in the numbers how each type of memory 
+is performing.  So for instance when bandwidth
+writes a 256-byte chunk, you know that because
+caches are normally write-back, this chunk
+will reside entirely in the L1 cache, whereas
+a 512 kB chunk will mainly reside in L2.
+
+You could run a non-artificial benchmark and 
+observe that a general performance number is lower 
+on that machine, but that conceals the cause. 
+So the purpose of this program is to help you 
+pinpoint the cause of a performance problem,
+and determine whether it is memory related.
+It also tells you the best-case scenario i.e.
+the maximum bandwidth achieved using sequential,
+128-bit memory accesses.
+
+Version 0.24 adds network bandwidth testing.
+
+Version 0.23 adds:
+- Mac OS/X 64-bit support.
+- Vector-to-vector register transfer test.
+- Main register to/from vector register transfer test.
+- Main register byte/word/dword/qword to/from 
+  vector register test (pinsr*, pextr* instructions).
+- Memory copy test using SSE2.
+- Automatic checks under Linux for SSE2 & SSE4.
+
+Version 0.22 adds:
+- Register-to-register transfer test.
+- Register-to/from-stack transfer tests.
+
+Version 0.21 adds:
+- Standardized memory chunks to always be
+  a multiple of 256-byte mini-chunks.
+- Random memory accesses, in which the
+  256-byte mini-chunks are accessed
+  in a random order and, inside each
+  mini-chunk, the 32/64/128-bit data are
+  accessed pseudo-randomly as well.
+- Now 'bandwidth' includes chunk sizes that 
+  are not powers of 2, which increases 
+  data points around the key chunk sizes 
+  corresponding to common L1 and L2 cache 
+  sizes.
+- Command-line options:
+	--quick for 0.25 seconds per test.
+	--slow for 20 seconds per test.
+	--title for adding a graph title.
+
+Version 0.20 added graphing, with the graph
+stored in a BMP image file. It also adds the
+--slow option for more precise runs.
+
+Version 0.19 added a second 128-bit SSE writer
+routine that bypasses the caches, in addition
+to the one that doesn't.
+
+Version 0.18 was my Grand Unified bandwidth
+benchmark that brought together support for
+four operating systems:
+	- Linux
+	- Windows Mobile
+	- 32-bit Windows
+	- Mac OS/X 64-bit
+and three processor architectures:
+	- x86
+	- Intel64
+	- ARM 
+I've written custom assembly routines for
+each architecture.
+
+Total run time for the default speed, which
+has 5 seconds per test, is about 35 minutes.
+
+--------------------------------------------------
+NETWORK BANDWIDTH (beginning with release 0.24a)
+
+In December 2010, I extended bandwidth to measure
+network bandwidth, which is useful for testing
+your home or work network setup, and in theory
+could be used to test larger networks as well.
+
+The network test is pretty simple. It sends chunks
+of data of varying sizes to whatever computers
+(nodes) that you specify. Each of those must be
+running 'bandwidth' in transponder mode.
+
+The chunks of data range from 8 kB up to 32 MB.
+
+Sample output:
+	output/Network-Mac-Linux-Win32.txt
+
+How to start a transponder:
+	./bandwidth-mac64 --transponder
+
+Example invocation of the test leader:
+	./bandwidth64 --network 192.168.1.104
+
+Areas for improvement:
+
+At present, the output of this test is not graphed.
+
+At present, the 'leader' in the test interacts
+with the specified nodes by sending data but
+not by receiving it.
+
+At present, the specified nodes do not interact
+with one another as part of the test.
+
+At present, it is not known whether the network
+code will work on ARM devices.
+I've only tested it on 
+	Linux 32-bit
+	Mac OS/X 32- and 64-bit
+	Win/Cygwin 32-bit.
+
+At present, it uses port 49000 but later
+the port will be specifiable.
+
+--------------------------------------------------
+This program is provided without any warranty
+and AS-IS. See the file COPYING for details.
+
+Zack Smith
+fbui@comcast.net
+December 2010
+
diff --git a/defs.h b/defs.h
new file mode 100644
index 0000000..eb269bc
--- /dev/null
+++ b/defs.h
@@ -0,0 +1,94 @@
+/*============================================================================
+  bandwidth 0.24, a benchmark to estimate memory transfer bandwidth.
+  Copyright (C) 2005-2010 by Zack T Smith.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+  The author may be reached at fbui@comcast.net.
+ *===========================================================================*/
+
+//---------------------------------------------------------------------------
+// Change log
+// 0.18	Grand unified version supports x86/intel64/arm, linux/win32/winmo.
+// 0.19	Now have 128-bit writer that goes to cache AND one that bypasses.
+// 0.20	Added my bmplib and graphing of output. Also added --slow option.
+// 0.21	Adds random testing. Min chunk size = 256 B. Allows non-2^n chunks.
+// 0.22	Adds register-to-register and register-to/from-stack transfers.
+// 0.23	Adds vector-to-vector and register-to-vector transfers, & Mac support.
+// 0.24	Adds network bandwidth tests from this PC to specified others.
+//---------------------------------------------------------------------------
+
+#ifndef _DEFS_H	// Include guard; the matching #endif is the last directive of this header.
+#define _DEFS_H
+
+#define VERSION "0.24a"	// Release string (narrow); keep in sync with VERSION_W.
+#define VERSION_W L"0.24a"	// The same release string as a wide literal.
+#define APPNAME L"Bandwidth WinMo "VERSION_W	// Wide application name with the version appended.
+
+#ifndef bool	// Only detects a bool *macro* (e.g. the one from <stdbool.h>).
+typedef char bool;	// Fallback boolean type, one char wide.
+enum { true = 1, false = 0 };
+#endif
+
+#define NETWORK_DEFAULT_PORTNUM (49000)	// The README states that the network test currently uses port 49000.
+#define NETWORK_CHUNK_SIZE (8192)	// NOTE(review): presumably bytes per network chunk; its users are outside this view.
+
+extern int Reader (void *ptr, unsigned long size, unsigned long loops);	// Memory test routines; their definitions are not in this header.
+extern int RandomReader (void *ptr, unsigned long n_chunks, unsigned long loops);
+// main.c calls Writer (chunk, size, loops, value) and RandomWriter (chunk_ptrs, size/256, loops, value).
+extern int Writer (void *ptr, unsigned long size, unsigned long loops, unsigned long value);
+extern int RandomWriter (void *ptr, unsigned long size, unsigned long loops, unsigned long value);	// NOTE(review): main.c passes a count of 256-byte chunks as "size".
+
+extern int RegisterToRegister (unsigned long);	// NOTE(review): argument is presumably a loop count -- confirm against the definition.
+
+extern int StackReader (unsigned long);
+extern int StackWriter (unsigned long);
+
+#ifndef __arm__	// NOTE(review): main.c guards its calls with !__arm__ && !__aarch64__; this test only checks __arm__.
+extern int RegisterToVector (unsigned long);	// SSE2
+extern int Register8ToVector (unsigned long);	// SSE2
+extern int Register16ToVector (unsigned long);	// SSE2
+extern int Register32ToVector (unsigned long);	// SSE2
+extern int Register64ToVector (unsigned long);	// SSE2
+
+extern int VectorToVector (unsigned long);	// SSE2
+
+extern int VectorToRegister (unsigned long);	// SSE2
+extern int Vector8ToRegister (unsigned long);	// SSE2
+extern int Vector16ToRegister (unsigned long);	// SSE2
+extern int Vector32ToRegister (unsigned long);	// SSE2
+extern int Vector64ToRegister (unsigned long);	// SSE2
+
+extern int CopySSE (void*, void*, unsigned long, unsigned long);	// SSE2
+extern int Copy (void*, void*, unsigned long, unsigned long);	
+
+extern int ReaderSSE2 (void *ptr, unsigned long, unsigned long);
+extern int RandomReaderSSE2 (unsigned long **ptr, unsigned long, unsigned long);
+
+extern int WriterSSE2 (void *ptr, unsigned long, unsigned long, unsigned long);	// main.c passes (chunk, size, loops, value).
+extern int RandomWriterSSE2(unsigned long **ptr, unsigned long, unsigned long, unsigned long);	// main.c passes (chunk_ptrs, size/256, loops, value).
+
+extern int WriterSSE2_bypass (void *ptr, unsigned long, unsigned long, unsigned long);	// Selected by main.c for its "bypassing cache" write mode.
+extern int RandomWriterSSE2_bypass (unsigned long **ptr, unsigned long, unsigned long, unsigned long);
+
+extern int has_sse2 ();	// NOTE(review): empty parentheses are not a prototype; presumably reports SSE2 availability.
+#endif
+
+#define FBLOOPS_R 400	// NOTE(review): presumably loop counts for framebuffer read/write tests; users are outside this view.
+#define FBLOOPS_W 800
+#define FB_SIZE (640*480*2)	// 614400. NOTE(review): looks like 640x480 at 2 bytes per pixel -- confirm.
+
+#endif	// _DEFS_H
+
diff --git a/main.c b/main.c
new file mode 100644
index 0000000..8847a7f
--- /dev/null
+++ b/main.c
@@ -0,0 +1,2245 @@
+/*============================================================================
+  bandwidth 0.24, a benchmark to estimate memory transfer bandwidth.
+  Copyright (C) 2005-2010 by Zack T Smith.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+  The author may be reached at fbui@comcast.net.
+ *===========================================================================*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <wchar.h>
+#include <math.h>
+
+#include <netdb.h> // gethostbyname
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+
+#include "defs.h"
+#include "BMP.h"
+#include "config.h"
+
+#ifdef __WIN32__
+#include <windows.h>
+#endif
+
+#ifdef __linux__
+#include <linux/fb.h>
+#include <sys/mman.h>
+#endif
+
+#ifdef CONFIG_ARCH_S2L	// NOTE(review): CONFIG_* macros presumably come from config.h, included above.
+#if  defined(CONFIG_BSP_BOARD_S2LM_KIWI) || defined(CONFIG_BSP_BOARD_STRAWBERRY)
+#define DRAM_SIZE_SMALL	// NOTE(review): not referenced in this part of the file; presumably trims memory-hungry tests on these boards.
+#endif
+#endif
+
+//----------------------------------------
+// Graphing data.
+//
+static char graph_title [500];	// Title text; graph_init() draws it above the plot.
+#define TITLE "Results from bandwidth " VERSION " by Zack Smith, http://caladan.tk"
+static BMP *graph;	// Graph of results.
+static int graph_width = 1280;	// Graph dimensions used when laying out the axes.
+static int graph_height = 720;
+static int graph_left_margin = 100;	// Left margin; the vertical-axis labels are drawn inside it.
+static int graph_margin = 50; // top/bottom/right
+static int graph_x_span = 1;	// Plot area width and height; set by graph_init().
+static int graph_y_span = 1;
+static int graph_last_x = -1;	// Previous point of the current line; -1 means none yet.
+static int graph_last_y = -1;
+static unsigned long graph_fg = RGB_BLACK;	// Colour of the line currently being drawn.
+static int legend_y;	// y of the next legend entry; graph_new_line() advances it by 10.
+#define MAX_GRAPH_DATA 5000
+static long graph_data [MAX_GRAPH_DATA];	// Flat list of (DATUM_* tag, value) pairs.
+static int graph_data_index = 0;	// Number of graph_data slots in use.
+enum {	// Tags stored at the even indices of graph_data.
+	DATUM_SIZE=0,	// Value is a chunk size; graph_plot() looks it up in chunk_sizes.
+	DATUM_AMOUNT=1,	// Value is the bandwidth figure for that chunk size.
+	DATUM_COLOR=2,	// Value is the colour for the points that follow.
+};
+static int max_bandwidth = 0;	// Always 10 times the # of megabyte/sec.
+
+static bool use_sse2 = true;	// NOTE(review): neither flag is referenced in this part of the file.
+static bool use_sse4 = true;
+
+//----------------------------------------
+// Parameters for the tests.
+//
+static long usec_per_test = 5000000;	// 5 seconds per test.
+// Chunk sizes in bytes, ending with a 0 sentinel. Entry i pairs with chunk_sizes_log2 [i].
+static int chunk_sizes[] = {
+	256,
+	512,
+	768,
+	1024,
+	2048,
+	3072,
+	4096,
+	6144,
+	8192,	// Some processors' L1 data caches are only 8kB.
+	12288,
+	16384,
+	20480,
+	24576,
+	28672,
+	32768,	// Common L1 data cache size.
+	40960,
+	49152,
+	65536,
+	131072,	// Old L2 cache size.
+	192 * 1024,
+	256 * 1024,	// Old L2 cache size.
+	384 * 1024,
+	512 * 1024,	// Old L2 cache size.
+	768 * 1024,
+	1 << 20,	// 1 MB = common L2 cache size.
+	(1024 + 256) * 1024,	// 1.25
+	(1024 + 512) * 1024,	// 1.5
+	(1024 + 768) * 1024,	// 1.75
+	1 << 21,	// 2 MB = common L2 cache size.
+	(2048 + 256) * 1024,	// 2.25
+	(2048 + 512) * 1024,	// 2.5
+	(2048 + 768) * 1024,	// 2.75
+	3072 * 1024,	// 3 MB = common L2 cache size.
+	1 << 22,	// 4 MB
+	5242880,	// 5 megs
+	6291456,	// 6 megs (std L2 cache size)
+#if !defined(__arm__) && !defined(__aarch64__)	// The largest sizes are left out on ARM builds.
+	7 * 1024 * 1024,	// 7 megs
+	8 * 1024 * 1024,
+	16 * 1024 * 1024,
+	64 * 1024 * 1024,
+#endif
+	0
+};
+
+//----------------------------------------
+// Under CeGCC, the math.h log2() function
+// turned out to be very inaccurate e.g.
+// log2(8)=1.44, so I have here hard-coded
+// the logarithms.
+// Entry i is log2 (chunk_sizes [i]); the list ends with a 0 sentinel.
+static double chunk_sizes_log2[] =
+{
+	8,		// 256 B
+	9,		// 512 B
+	9.585,		// 768 B
+	10,		// 1 kB
+	11,		// 2 kB
+	11.585,		// 3 kB
+	12,		// 4 kB
+	12.585,		// 6 kB
+	13,		// 8 kB
+	13.585,		// 12 kB
+	14,		// 16 kB
+	14.3219,	// 20 kB
+	14.585,		// 24 kB
+	14.8074,	// 28 kB
+	15,		// 32 kB
+	15.3219,	// 40 kB
+	15.585,		// 48 kB
+	16,		// 64 kB
+	17,		// 128 kB
+	17.585,		// 192 kB
+	18,		// 256 kB
+	18.585,		// 384 kB
+	19,		// 512 kB
+	19.585,		// 768 kB
+	20,		// 1 MB
+	20.3219,	// 1.25
+	20.585,		// 1.5
+	20.8074,	// 1.75
+	21,		// 2 MB
+	21.1699,	// 2.25 MB
+	21.3219,	// 2.5 MB
+	21.4594,	// 2.75 MB
+	21.585,		// 3 MB
+	22,		// 4 MB
+	22.3219,	// 5 MB
+	22.585,		// 6 MB
+#if !defined(__arm__) && !defined(__aarch64__)	// Must mirror the same condition in chunk_sizes.
+	22.8074,	// 7 MB
+	23,		// 8 MB
+	24,		// 16 MB
+	26,		// 64 MB
+#endif
+	0
+};
+
+static int min_chunk_size = 1;	// These are determined in graph_draw_labels().
+static int max_chunk_size = 1;	// Both hold log2 exponents (truncated to int), not byte counts.
+
+//----------------------------------------------------------------------------
+// Name:	error
+// Purpose:	Complain and exit. s goes to stderr, or to a message box on Win32.
+//----------------------------------------------------------------------------
+void error (char *s)
+{
+#ifndef __WIN32__
+	fprintf (stderr, "Error: %s\n", s);
+	exit (1);
+#else
+	wchar_t tmp [200];	// Widened copy of s for MessageBoxW.
+	int i;
+	for (i = 0; s[i] && i < 199; i++)	// Stop early so the terminator always fits in tmp.
+		tmp[i] = s[i];
+	tmp[i] = 0;
+	MessageBoxW (0, tmp, L"Error", 0);
+	ExitProcess (0);
+#endif
+}
+
+void
+dump_hex64 (unsigned long long value)
+{
+	// Print the 16 nibbles of value, most significant first, in lowercase hex.
+	int shift;
+	for (shift = 60; shift >= 0; shift -= 4) {
+		// Isolate the nibble that sits at this bit position.
+		unsigned int nibble = (unsigned int) ((value >> shift) & 0xf);
+
+		printf ("%1x", nibble);
+	}
+}
+
+//============================================================================
+// Graphing logic.
+//============================================================================
+
+//----------------------------------------------------------------------------
+// Name:	graph_draw_labels
+// Purpose:	Draw the labels and ticks. Also sets min_chunk_size/max_chunk_size.
+//----------------------------------------------------------------------------
+void
+graph_draw_labels ()
+{
+	int i;
+
+	//----------------------------------------
+	// Horizontal
+	//
+	//--------------------
+	// Establish min & max.
+	// Both are log2 exponents; the table's fractions are truncated by j.
+	min_chunk_size = 1000;
+	max_chunk_size = 0;
+	i = 0;
+	int j;
+	while ((j = chunk_sizes_log2 [i])) {	// The 0 sentinel ends the scan.
+		if (j < min_chunk_size)
+			min_chunk_size = j;
+		if (j > max_chunk_size)
+			max_chunk_size = j;
+		i++;
+	}
+
+	for (i = min_chunk_size; i <= max_chunk_size; i++) {	// One tick per power of two.
+		char str[20];
+		int x = graph_left_margin +
+			((i-min_chunk_size) * graph_x_span) /
+			(max_chunk_size - min_chunk_size);
+		int y = graph_height - graph_margin + 10;
+
+		unsigned long amt = 1UL << i;	// Shift in unsigned long, the type of amt.
+		if (amt < 1024)
+			snprintf (str, sizeof str, "%lu B", amt);	// %lu matches unsigned long.
+		else if (amt < (1<<20)) {
+			snprintf (str, sizeof str, "%lu kB", amt >> 10);
+		}
+		else {
+			j = amt >> 20;
+			switch ((amt >> 18) & 3) {	// Quarter-megabyte part of amt.
+			case 0: snprintf (str, sizeof str, "%d MB", j); break;
+			case 1: snprintf (str, sizeof str, "%d.25 MB", j); break;
+			case 2: snprintf (str, sizeof str, "%d.5 MB", j); break;
+			case 3: snprintf (str, sizeof str, "%d.75 MB", j); break;
+			}
+		}
+
+		BMP_vline (graph, x, y, y-10, RGB_BLACK);
+		BMP_draw_mini_string (graph, str, x - 10, y+8, RGB_BLACK);
+	}
+
+	//----------------------------------------
+	// Vertical
+	// One tick per 10000 units of max_bandwidth, labelled in GB/s.
+	for (i = 0; i <= (max_bandwidth/10000); i++) {
+		char str[20];
+		int x = graph_left_margin - 10;
+		int y = graph_height - graph_margin -
+			(i * graph_y_span) / (max_bandwidth/10000);
+
+		BMP_hline (graph, x, x+10, y, RGB_BLACK);
+
+		snprintf (str, sizeof str, "%d GB/s", i);
+		BMP_draw_mini_string (graph, str,
+			x - 40, y - MINIFONT_HEIGHT/2, RGB_BLACK);
+	}
+}
+
+void
+graph_init ()	// Clears the bitmap and draws the axes and title; does nothing when graph is NULL.
+{
+	if (!graph)
+		return;
+
+	BMP_clear (graph, RGB_WHITE);
+
+	BMP_hline (graph, graph_left_margin, graph_width - graph_margin,	// Horizontal axis along the bottom margin.
+			graph_height - graph_margin, RGB_BLACK);
+	BMP_vline (graph, graph_left_margin, graph_margin,	// Vertical axis along the left margin.
+			graph_height - graph_margin, RGB_BLACK);
+
+	graph_x_span = graph_width - (graph_margin + graph_left_margin);	// Plot width between the margins.
+	graph_y_span = graph_height - 2 * graph_margin;	// Plot height between the margins.
+
+	BMP_draw_mini_string (graph, graph_title,
+		graph_left_margin, graph_margin/2, RGB_BLACK);
+
+	legend_y = graph_margin;	// Legend entries start at the top margin.
+}
+
+void
+graph_new_line (char *str, unsigned long color)	// Starts a new data series labelled str, drawn in color.
+{
+	BMP_draw_mini_string (graph, str,	// Legend entry, written in the series colour.
+		graph_width - graph_margin - 200, legend_y, color);
+
+	legend_y += 10;	// The next legend entry goes 10 pixels further down.
+
+	graph_fg = color;
+	graph_last_x = graph_last_y = -1;	// No previous point yet for this series.
+
+	if (graph_data_index >= MAX_GRAPH_DATA-2)
+		error ("Too many graph data.");
+
+	graph_data [graph_data_index++] = DATUM_COLOR;	// Recorded so graph_make() can switch colour when replaying.
+	graph_data [graph_data_index++] = (long) color;
+}
+
+//----------------------------------------------------------------------------
+// Name:	graph_add_point
+// Purpose:	Queues one (size, amount) point for graph_make() to draw later.
+//----------------------------------------------------------------------------
+void
+graph_add_point (int size, int amount)
+{
+	if (MAX_GRAPH_DATA-4 <= graph_data_index)
+		error ("Too many graph data.");
+	// A point takes four slots: a size pair followed by an amount pair.
+	long *slot = &graph_data [graph_data_index];
+	slot [0] = DATUM_SIZE;		slot [1] = size;
+	slot [2] = DATUM_AMOUNT;	slot [3] = amount;
+	graph_data_index += 4;
+}
+
+//----------------------------------------------------------------------------
+// Name:	graph_plot
+// Purpose:	Plots a point on the current graph, joined to the previous one.
+//----------------------------------------------------------------------------
+void
+graph_plot (int size, int amount)
+{
+	//----------------------------------------
+	// Get the log2 of the chunk size.
+	// We cannot rely on the libm math.h log2
+	// function, because under CeGCC,
+	// log2(8) = 1.44.
+	//
+	int i = 0;
+	while (chunk_sizes [i] && chunk_sizes [i] != size)
+		i++;
+	if (!chunk_sizes [i])	// Reached the 0 sentinel: size is not in the table.
+		error ("Lookup of chunk size failed.");
+	double tmp = chunk_sizes_log2 [i];
+
+	//----------------------------------------
+	// Plot the point. The x axis is
+	// logarithmic, base 2.
+	//
+	tmp -= (double) min_chunk_size;
+	tmp *= (double) graph_x_span;
+	tmp /= (double) (max_chunk_size - min_chunk_size);
+
+	int x = graph_left_margin + (int) tmp;
+	int y = graph_height - graph_margin -
+		(int) (((long long) amount * graph_y_span) / max_bandwidth);
+
+// The product above is widened because amount * graph_y_span can exceed INT_MAX.
+
+	if (graph_last_x != -1 && graph_last_y != -1) {	// Skip the segment for a series' first point.
+		BMP_line (graph, graph_last_x, graph_last_y, x, y, graph_fg);
+	}
+
+	graph_last_x = x;
+	graph_last_y = y;
+}
+
+//----------------------------------------------------------------------------
+// Name:	graph_make
+// Purpose:	Plots all lines from the (tag, value) pairs queued in graph_data.
+//----------------------------------------------------------------------------
+void
+graph_make ()
+{
+	int i;
+
+	//----------------------------------------
+	// Get the maximum bandwidth in order to
+	// properly scale the graph vertically.
+	//
+	max_bandwidth = 0;
+	for (i = 0; i < graph_data_index; i += 2) {	// Step over (tag, value) pairs.
+		if (graph_data[i] == DATUM_AMOUNT) {
+			int amt = graph_data[i+1];
+			if (amt > max_bandwidth)
+				max_bandwidth = amt;
+		}
+	}
+	max_bandwidth /= 10000;	// These three steps round up to the next
+	max_bandwidth *= 10000;	// multiple of 10000, so the result is
+	max_bandwidth += 10000;	// never zero and exceeds every data point.
+
+	graph_draw_labels ();	// Needs max_bandwidth; also sets min/max_chunk_size for graph_plot().
+
+	//----------------------------------------
+	// OK, now draw the lines.
+	//
+	int size = -1, amt = -1;	// -1 marks "not seen yet" for the current point.
+	for (i = 0; i < graph_data_index; i += 2)
+	{
+		int type = graph_data[i];
+		long value = graph_data[i+1];
+
+		switch (type) {
+		case DATUM_AMOUNT:	amt = value; break;
+		case DATUM_SIZE:	size = value; break;
+		case DATUM_COLOR:	// Start of a new series: new colour, no previous point.
+			graph_fg = (unsigned long) value;
+			graph_last_x = -1;
+			graph_last_y = -1;
+			break;
+		}
+
+		if (amt != -1 && size != -1) {	// Both halves of a point have arrived.
+			graph_plot (size, amt);
+			amt = size = -1;
+		}
+	}
+}
+
+//============================================================================
+// Output buffer logic.
+//============================================================================
+
+#define MSGLEN 10000	// Capacity of msg, in wide characters.
+static wchar_t msg [MSGLEN];	// Pending output text; the print*() functions append to it and dump() empties it.
+
+void print (wchar_t *s)	// Appends s to msg; text that does not fit is dropped.
+{
+	wcsncat (msg, s, MSGLEN - 1 - wcslen (msg));	// Bounded, unlike wcscat.
+}
+
+void newline ()	// Appends a newline to msg, if there is room.
+{
+	wcsncat (msg, L"\n", MSGLEN - 1 - wcslen (msg));	// Leaves room for the terminator.
+}
+
+void println (wchar_t *s)	// Appends s and then a newline to msg.
+{
+	wcsncat (msg, s, MSGLEN - 1 - wcslen (msg));	// Same bound as print().
+	newline ();
+}
+
+void print_int (int d)	// Appends d, in decimal, to msg.
+{
+#if defined(__WIN32__) && (defined(__arm__) || defined(__aarch64__))
+	swprintf (msg + wcslen (msg), L"%d", d);	// This swprintf form takes no size, so it stays unbounded.
+#else
+	swprintf (msg + wcslen (msg), MSGLEN - wcslen (msg), L"%d", d);	// Size is the room left, not all of msg.
+#endif
+}
+
+void println_int (int d)	// Appends d and then a newline to msg.
+{
+	print_int (d);
+	newline ();
+}
+
+void print_result (long double result)	// Appends result to msg as "<value> MB/s" with one decimal place.
+{
+#if defined(__WIN32__) && (defined(__arm__) || defined(__aarch64__))
+	swprintf (msg + wcslen (msg), L"%.1Lf MB/s", result);	// This swprintf form takes no size, so it stays unbounded.
+#else
+	swprintf (msg + wcslen (msg), MSGLEN - wcslen (msg), L"%.1Lf MB/s", result);	// Size is the room left, not all of msg.
+#endif
+}
+
+void dump (FILE *f)	// Writes msg to f (stdout when f is NULL), then empties msg.
+{
+	FILE *out = f ? f : stdout;
+	const wchar_t *cursor;
+
+	// Each wide character is narrowed to a plain char on output.
+	for (cursor = msg; *cursor; cursor++) {
+		char narrow = (char) *cursor;
+		fputc (narrow, out);
+	}
+
+	// Reset the buffer so that later output starts from scratch.
+	msg [0] = 0;
+}
+
+void flush ()	// Pushes pending output to the user.
+{
+#if defined(__WIN32__) && (defined(__arm__) || defined(__aarch64__))
+	MessageBeep (MB_OK);	// Only beeps on this build; msg is left untouched.
+#else
+	dump (NULL);	// NULL makes dump() write to stdout and empty msg.
+	fflush (stdout);
+#endif
+}
+
+void print_size (unsigned long size)	// Appends size to msg, expressed in B, kB or MB.
+{
+	static wchar_t *quarters [] = { NULL, L".25", L".5", L".75" };	// Indexed by the quarter-MB bits of size.
+	if (size < 1024) {
+		print_int (size);
+		print (L" B");
+		return;
+	}
+	if (size < (1<<20)) {
+		print_int (size >> 10);
+		print (L" kB");
+		return;
+	}
+	print_int (size >> 20);
+	wchar_t *fraction = quarters [(size >> 18) & 3];
+	if (fraction)
+		print (fraction);
+	print (L" MB");
+}
+
+//============================================================================
+// Timing logic.
+//============================================================================
+
+//----------------------------------------------------------------------------
+// Name:	mytime
+// Purpose:	Reports time in microseconds. The value may wrap; use differences.
+//----------------------------------------------------------------------------
+unsigned long mytime ()
+{
+#ifndef __WIN32__
+	// POSIX leaves a non-NULL timezone argument unspecified, so pass NULL.
+	struct timeval tv;
+	gettimeofday (&tv, NULL);
+	// Unsigned math: 1000000 * tv_sec overflows a signed 32-bit long.
+	return 1000000UL * (unsigned long) tv.tv_sec + (unsigned long) tv.tv_usec;
+#else
+	return 1000 * GetTickCount ();	// accurate enough.
+#endif
+}
+
+//----------------------------------------------------------------------------
+// Name:	calculate_result
+// Purpose:	Converts a byte count and elapsed time to MB/s and prints it.
+// Returns:	10 times the number of megabytes per second.
+//----------------------------------------------------------------------------
+int
+calculate_result (unsigned long chunk_size, long long total_count, long diff)
+{
+	if (diff == 0)
+		error ("Zero time difference.");
+
+	// Bytes moved = chunk size times the number of passes over the chunk.
+	long double bytes_moved = (long double) chunk_size * (long double) total_count;
+	// diff is in microseconds: scale to bytes per second, then to MB (2^20).
+	long double per_second = bytes_moved * 1000000.;
+	long double mb_per_sec = per_second / 1048576. / (long double) diff;
+
+	// Append "<value> MB/s" to the output buffer.
+	print_result (mb_per_sec);
+
+	return (long) (10.0 * mb_per_sec);
+}
+
+//============================================================================
+// Tests.
+//============================================================================
+
+//----------------------------------------------------------------------------
+// Name:	do_write
+// Purpose:	Times repeated writes to a chunk of memory of the specified size.
+//----------------------------------------------------------------------------
+enum {	// Values accepted for the mode argument of do_write().
+	NO_SSE2,
+	SSE2,
+	SSE2_BYPASS,
+};
+int	// Returns the value of calculate_result(): 10 times the MB/s figure.
+do_write (unsigned long size, int mode, bool random)
+{
+	unsigned char *chunk;	// 16-byte-aligned start of the test area.
+	unsigned char *chunk0;	// Pointer returned by malloc; this is what gets freed.
+	unsigned long loops;
+	unsigned long long total_count=0;
+#if defined(__x86_64__) || defined(__aarch64__)
+	unsigned long value = 0x1234567689abcdef;	// NOTE(review): the digit run "6768" looks like a typo; harmless fill value.
+#else
+	unsigned long value = 0x12345678;
+#endif
+	unsigned long diff=0, t0;
+	unsigned long tmp;
+	unsigned long **chunk_ptrs = NULL;	// Shuffled 256-byte chunk pointers; only used when random.
+
+	if (size & 255)	// NOTE(review): size == 0 passes this check and would divide by zero below.
+		error ("do_write(): chunk size is not multiple of 256.");
+
+	//-------------------------------------------------
+	chunk0 = malloc (size+32);	// Extra bytes leave room for the alignment adjustment.
+	chunk = chunk0;
+	if (!chunk)
+		error ("Out of memory");
+
+	tmp = (unsigned long) chunk;	// NOTE(review): assumes a pointer fits in unsigned long.
+	if (tmp & 15) {	// Round the address up to the next multiple of 16.
+		tmp -= (tmp & 15);
+		tmp += 16;
+		chunk = (unsigned char*) tmp;
+	}
+
+	//----------------------------------------
+	// Set up random pointers to chunks.
+	//
+	if (random) {
+		tmp = size/256;	// From here on tmp is the number of 256-byte chunks.
+		chunk_ptrs = (unsigned long**) malloc (sizeof (unsigned long*) * tmp);
+		if (!chunk_ptrs)
+			error ("Out of memory.");
+
+		//----------------------------------------
+		// Store pointers to all chunks into array.
+		//
+		int i;
+		for (i = 0; i < tmp; i++) {
+			chunk_ptrs [i] = (unsigned long*) (chunk + 256 * i);
+		}
+
+		//----------------------------------------
+		// Randomize the array of chunk pointers.
+		//
+		int k = 100;	// 100 passes, each swapping every entry with a random one.
+		while (k--) {
+			for (i = 0; i < tmp; i++) {
+				int j = rand() % tmp;
+				if (i != j) {
+					unsigned long *ptr = chunk_ptrs [i];
+					chunk_ptrs [i] = chunk_ptrs [j];
+					chunk_ptrs [j] = ptr;
+				}
+			}
+		}
+	}
+
+	//-------------------------------------------------
+	if (random)
+		print (L"Random write ");
+	else
+		print (L"Sequential write ");
+
+	if (mode == SSE2) {
+		print (L"(128-bit), size = ");
+	}
+	else
+	if (mode == SSE2_BYPASS) {
+		print (L"bypassing cache (128-bit), size = ");
+	} else {
+#if defined(__x86_64__) || defined(__aarch64__)
+		print (L"(64-bit), size = ");
+#else
+		print (L"(32-bit), size = ");
+#endif
+	}
+
+	print_size (size);
+	print (L", ");
+
+	loops = (1 << 26) / size;// XX need to adjust for CPU MHz
+
+	t0 = mytime ();
+
+	while (diff < usec_per_test) {	// Keep running batches until the time budget is used up.
+		total_count += loops;
+
+#if !defined(__arm__) && !defined(__aarch64__)
+		if (mode == SSE2) {
+			if (random)
+				RandomWriterSSE2 (chunk_ptrs, size/256, loops, value);
+			else
+				WriterSSE2 (chunk, size, loops, value);
+		}
+		else
+		if (mode == SSE2_BYPASS) {
+			if (random)
+				RandomWriterSSE2_bypass (chunk_ptrs, size/256, loops, value);
+			else
+				WriterSSE2_bypass (chunk, size, loops, value);
+		}
+		else
+#endif
+		if (random)	// Reached for NO_SSE2, and for every mode on ARM builds.
+			RandomWriter (chunk_ptrs, size/256, loops, value);
+		else
+			Writer (chunk, size, loops, value);
+
+		diff = mytime () - t0;	// Elapsed microseconds since the test started.
+	}
+
+	print (L"loops = ");
+	print_int (total_count);	// NOTE(review): print_int takes an int, so a very large count is truncated.
+	print (L", ");
+
+	flush ();
+
+	int result = calculate_result (size, total_count, diff);
+	newline ();
+
+	flush ();
+
+	free ((void*)chunk0);	// Free the original pointer, not the aligned one.
+
+	if (chunk_ptrs)
+		free (chunk_ptrs);
+
+	return result;
+}
+
+
+//----------------------------------------------------------------------------
+// Name:	do_read
+// Purpose:	Times reads over a 16-byte-aligned chunk of `size` bytes, either
+//		sequentially or, if `random`, through a shuffled array of pointers
+//		to its 256-byte blocks; `use_sse2` selects the 128-bit readers.
+// Returns:	calculate_result()'s value (10x the printed rate, truncated).
+//----------------------------------------------------------------------------
+int
+do_read (unsigned long size, bool use_sse2, bool random)
+{
+	unsigned long loops;
+	unsigned long long total_count = 0;
+	unsigned long t0, diff=0;
+	unsigned char *chunk, *chunk0;	// aligned start; pointer to free()
+	unsigned long tmp;
+	unsigned long **chunk_ptrs = NULL;
+
+	if (!size || (size & 255))	// size 0 would divide by zero below
+		error ("do_read(): chunk size is not multiple of 256.");
+
+	// Over-allocate so the start can be rounded up to a 16-byte boundary.
+	chunk0 = chunk = malloc (size+32);
+	if (!chunk)
+		error ("Out of memory");
+
+	tmp = (unsigned long) chunk;
+	if (tmp & 15) {
+		tmp -= (tmp & 15);
+		tmp += 16;
+		chunk = (unsigned char*) tmp;
+	}
+
+	memset (chunk, 0, size);	// done after aligning: covers every byte read
+
+	//----------------------------------------
+	// Set up random pointers to chunks.
+	//
+	if (random) {
+		tmp = size/256;
+		chunk_ptrs = (unsigned long**) malloc (sizeof (unsigned long*) * tmp);
+		if (!chunk_ptrs)
+			error ("Out of memory.");
+
+		//----------------------------------------
+		// Store pointers to all chunks into array.
+		//
+		unsigned long i;	// unsigned like tmp; avoids int overflow in 256*i
+		for (i = 0; i < tmp; i++) {
+			chunk_ptrs [i] = (unsigned long*) (chunk + 256 * i);
+		}
+
+		//----------------------------------------
+		// Randomize the array of chunk pointers.
+		//
+		int k = 100;
+		while (k--) {
+			for (i = 0; i < tmp; i++) {
+				unsigned long j = rand() % tmp;
+				if (i != j) {
+					unsigned long *ptr = chunk_ptrs [i];
+					chunk_ptrs [i] = chunk_ptrs [j];
+					chunk_ptrs [j] = ptr;
+				}
+			}
+		}
+	}
+
+	// Print the test label.
+	if (random)
+		print (L"Random read ");
+	else
+		print (L"Sequential read ");
+
+	if (use_sse2) {
+		print (L"(128-bit), size = ");
+	} else {
+#if defined(__x86_64__) || defined(__aarch64__)
+		print (L"(64-bit), size = ");
+#else
+		print (L"(32-bit), size = ");
+#endif
+	}
+
+	print_size (size);
+	print (L", ");
+
+	flush ();
+
+	loops = (1 << 26) / size;	// XX need to adjust for CPU MHz
+	if (!loops) loops = 1;	// sizes above 64 MB would otherwise yield 0 passes
+	t0 = mytime ();
+
+	while (diff < usec_per_test) {
+		total_count += loops;
+
+#if !defined(__arm__) && !defined(__aarch64__)
+		if (use_sse2) {
+			if (random)
+				RandomReaderSSE2 (chunk_ptrs, size/256, loops);
+			else
+				ReaderSSE2 (chunk, size, loops);
+		}
+		else
+#endif
+		if (random)
+			RandomReader (chunk_ptrs, size/256, loops);
+		else
+			Reader (chunk, size, loops);
+
+		diff = mytime () - t0;
+	}
+
+	print (L"loops = ");
+	print_int (total_count);
+	print (L", ");
+
+	int result = calculate_result (size, total_count, diff);
+	newline ();
+
+	flush ();
+
+	free (chunk0);
+	free (chunk_ptrs);	// free(NULL) is a no-op, so no guard is needed
+
+	return result;
+}
+
+
+
+//----------------------------------------------------------------------------
+// Name:	do_copy
+// Purpose:	Times CopySSE() copies between two 16-byte-aligned chunks of
+//		`size` bytes and prints the label, loop count and rate.
+// Returns:	calculate_result()'s value (10x the printed rate, truncated).
+//----------------------------------------------------------------------------
+int
+do_copy (unsigned long size, int mode)
+{
+	unsigned long loops;
+	unsigned long long total_count = 0;
+	unsigned long t0, diff=0;
+	unsigned char *chunk_src, *chunk_dest;		// 16-byte-aligned starts
+	unsigned char *chunk_src0, *chunk_dest0;	// pointers to free()
+	unsigned long tmp;
+
+	if (!size || (size & 255))	// size 0 would divide by zero below
+		error ("do_copy(): chunk size is not multiple of 256.");
+
+	// Over-allocate so each start can be rounded up to a 16-byte boundary.
+	chunk_src0 = chunk_src = malloc (size+32);
+	if (!chunk_src)
+		error ("Out of memory");
+	chunk_dest0 = chunk_dest = malloc (size+32);
+	if (!chunk_dest)
+		error ("Out of memory");
+
+	tmp = (unsigned long) chunk_src;
+	if (tmp & 15) {
+		tmp -= (tmp & 15);
+		tmp += 16;
+		chunk_src = (unsigned char*) tmp;
+	}
+	tmp = (unsigned long) chunk_dest;
+	if (tmp & 15) {
+		tmp -= (tmp & 15);
+		tmp += 16;
+		chunk_dest = (unsigned char*) tmp;
+	}
+
+	memset (chunk_src, 100, size);	// filled after aligning: covers all bytes used
+	memset (chunk_dest, 200, size);
+
+	// Print the test label.
+	print (L"Sequential copy ");
+
+	if (mode == SSE2) {
+		print (L"(128-bit), size = ");
+	}
+	else {
+#if defined(__x86_64__) || defined(__aarch64__)
+		print (L"(64-bit), size = ");
+#else
+		print (L"(32-bit), size = ");
+#endif
+	}
+
+	print_size (size);
+	print (L", ");
+
+	flush ();
+
+	loops = (1 << 26) / size;	// XX need to adjust for CPU MHz
+	if (!loops) loops = 1;	// sizes above 64 MB would otherwise yield 0 passes
+	t0 = mytime ();
+
+	while (diff < usec_per_test) {
+		total_count += loops;
+
+#if !defined(__arm__) && !defined(__aarch64__)
+		if (mode == SSE2)
+			CopySSE (chunk_dest, chunk_src, size, loops);
+#if 0	// NOTE(review): disabled, so any mode other than SSE2 times an empty loop
+		else
+			Copy (chunk_dest, chunk_src, size, loops);
+#endif
+#endif
+
+		diff = mytime () - t0;
+	}
+
+	print (L"loops = ");
+	print_int (total_count);
+	print (L", ");
+
+	int result = calculate_result (size, total_count, diff);
+	newline ();
+
+	flush ();
+
+	free (chunk_src0);
+	free (chunk_dest0);
+
+	return result;
+}
+
+
+//----------------------------------------------------------------------------
+// Name:	fb_readwrite
+// Purpose:	Performs sequential read & write tests on framebuffer memory.
+//		Maps /dev/fb0 (or /dev/fb/0), times FBLOOPS_R read passes and
+//		FBLOOPS_W write passes over FB_SIZE bytes of it, prints both
+//		rates, then unmaps the framebuffer and closes the device.
+//----------------------------------------------------------------------------
+#if defined(__linux__) && defined(FBIOGET_FSCREENINFO)
+void
+fb_readwrite (bool use_sse2)
+{
+	unsigned long total_count;
+	unsigned long length;
+	unsigned long diff, t0;
+	unsigned long fblen;
+	static struct fb_fix_screeninfo fi;
+	static struct fb_var_screeninfo vi;
+	unsigned long *fb = NULL;
+	int fd;
+#if defined(__x86_64__) || defined(__aarch64__)
+	unsigned long value = 0x1234567689abcdef;
+#else
+	unsigned long value = 0x12345678;
+#endif
+
+	// Open the framebuffer device and check that it is usable.
+	fd = open ("/dev/fb0", O_RDWR);
+	if (fd < 0)
+		fd = open ("/dev/fb/0", O_RDWR);
+	if (fd < 0) {
+		println (L"Cannot open framebuffer device.");
+		return;
+	}
+
+	if (ioctl (fd, FBIOGET_FSCREENINFO, &fi) ||
+	    ioctl (fd, FBIOGET_VSCREENINFO, &vi)) {
+		close (fd);
+		println (L"Cannot get framebuffer info");
+		return;
+	}
+
+	if (fi.visual != FB_VISUAL_TRUECOLOR &&
+	    fi.visual != FB_VISUAL_DIRECTCOLOR ) {
+		close (fd);
+		println (L"Need direct/truecolor framebuffer device.");
+		return;
+	}
+
+	print (L"Framebuffer resolution: ");
+	print_int (vi.xres);
+	print (L"x");
+	print_int (vi.yres);
+	print (L", ");
+	print_int (vi.bits_per_pixel);
+	println (L" bpp\n");
+
+	// The tests touch FB_SIZE bytes; refuse a smaller framebuffer rather
+	// than run off the end of the mapping.
+	fblen = fi.smem_len;
+	if (fblen < FB_SIZE) {
+		close (fd);
+		println (L"Framebuffer memory is too small.");
+		return;
+	}
+
+	fb = (unsigned long*) fi.smem_start;
+	fb = mmap (fb, fblen,
+		PROT_WRITE | PROT_READ,
+		MAP_SHARED, fd, 0);
+	if (fb == MAP_FAILED) {
+		close (fd);
+		println (L"Cannot access framebuffer memory.");
+		return;
+	}
+
+	//-------------------
+	// Use only the upper half of the display.
+	//
+	length = FB_SIZE;
+
+	//-------------------
+	// READ
+	//
+	print (L"Framebuffer memory sequential read ");
+	flush ();
+
+	t0 = mytime ();
+	total_count = FBLOOPS_R;
+
+#if !defined(__arm__) && !defined(__aarch64__)
+	if (use_sse2)
+		ReaderSSE2 (fb, length, FBLOOPS_R);
+	else
+#endif
+		Reader (fb, length, FBLOOPS_R);
+
+	diff = mytime () - t0;
+
+	calculate_result (length, total_count, diff);
+	newline ();
+
+	//-------------------
+	// WRITE
+	//
+	print (L"Framebuffer memory sequential write ");
+	flush ();
+
+	t0 = mytime ();
+	total_count = FBLOOPS_W;
+
+#if !defined(__arm__) && !defined(__aarch64__)
+	if (use_sse2)
+		WriterSSE2_bypass (fb, length, FBLOOPS_W, value);
+	else
+#endif
+		Writer (fb, length, FBLOOPS_W, value);
+
+	diff = mytime () - t0;
+
+	calculate_result (length, total_count, diff);
+	newline ();
+
+	// Release the mapping and the device (both used to be leaked).
+	munmap (fb, fblen);
+	close (fd);
+}
+#endif
+
+//----------------------------------------------------------------------------
+// Name:	register_test
+// Purpose:	Times each register-transfer routine and prints its rate via calculate_result().
+//----------------------------------------------------------------------------
+void
+register_test ()
+{
+	long long total_count = 0;
+	unsigned long t0;
+	unsigned long diff = 0;
+
+	// Every test below repeats its routine until usec_per_test has elapsed.
+#if defined(__x86_64__) || defined(__aarch64__)
+	print (L"Main register to main register transfers (64-bit) ");
+#else
+	print (L"Main register to main register transfers (32-bit) ");
+#endif
+	flush ();
+#define REGISTER_COUNT 10000	// argument passed to each call; also used by stack_test()
+
+	t0 = mytime ();
+	while (diff < usec_per_test)
+	{
+		RegisterToRegister (REGISTER_COUNT);
+		total_count += REGISTER_COUNT;
+
+		diff = mytime () - t0;
+	}
+
+	calculate_result (256, total_count, diff);	// 256 is the chunk_size argument — presumably bytes moved per count; TODO confirm
+	newline ();
+	flush ();
+
+#if !defined(__arm__) && !defined(__aarch64__)
+	// The vector-register tests from here on are compiled for non-ARM targets only.
+#ifdef __x86_64__
+	print (L"Main register to vector register transfers (64-bit) ");
+#else
+	print (L"Main register to vector register transfers (32-bit) ");
+#endif
+	flush ();
+#define VREGISTER_COUNT 3333	// argument passed to each vector-register routine
+
+	t0 = mytime ();
+	diff = 0;
+	total_count = 0;
+	while (diff < usec_per_test)
+	{
+		RegisterToVector (VREGISTER_COUNT);
+		total_count += VREGISTER_COUNT;
+
+		diff = mytime () - t0;
+	}
+
+	calculate_result (256, total_count, diff);
+	newline ();
+	flush ();
+
+	//--------------------------------------
+#ifdef __x86_64__
+	print (L"Vector register to main register transfers (64-bit) ");
+#else
+	print (L"Vector register to main register transfers (32-bit) ");
+#endif
+	flush ();
+
+	t0 = mytime ();
+	diff = 0;
+	total_count = 0;
+	while (diff < usec_per_test)
+	{
+		VectorToRegister (VREGISTER_COUNT);
+		total_count += VREGISTER_COUNT;
+
+		diff = mytime () - t0;
+	}
+
+	calculate_result (256, total_count, diff);
+	newline ();
+	flush ();
+
+	//--------------------------------------
+	print (L"Vector register to vector register transfers (128-bit) ");
+	flush ();
+
+	t0 = mytime ();
+	diff = 0;
+	total_count = 0;
+	while (diff < usec_per_test)
+	{
+		VectorToVector (VREGISTER_COUNT);
+		total_count += VREGISTER_COUNT;
+
+		diff = mytime () - t0;
+	}
+
+	calculate_result (256, total_count, diff);
+	newline ();
+	flush ();
+
+	//--------------------------------------
+	if (use_sse4) {	// the 8-bit and 32-bit datum tests run only while use_sse4 is set
+		print (L"Vector 8-bit datum to main register transfers ");
+		flush ();
+
+		t0 = mytime ();
+		diff = 0;
+		total_count = 0;
+		while (diff < usec_per_test)
+		{
+			Vector8ToRegister (VREGISTER_COUNT);
+			total_count += VREGISTER_COUNT;
+
+			diff = mytime () - t0;
+		}
+
+		calculate_result (256, total_count, diff);
+		newline ();
+		flush ();
+	}
+
+	//--------------------------------------
+	print (L"Vector 16-bit datum to main register transfers ");
+	flush ();
+
+	t0 = mytime ();
+	diff = 0;
+	total_count = 0;
+	while (diff < usec_per_test)
+	{
+		Vector16ToRegister (VREGISTER_COUNT);
+		total_count += VREGISTER_COUNT;
+
+		diff = mytime () - t0;
+	}
+
+	calculate_result (256, total_count, diff);
+	newline ();
+	flush ();
+
+	//--------------------------------------
+	if (use_sse4) {
+		print (L"Vector 32-bit datum to main register transfers ");
+		flush ();
+
+		t0 = mytime ();
+		diff = 0;
+		total_count = 0;
+		while (diff < usec_per_test)
+		{
+			Vector32ToRegister (VREGISTER_COUNT);
+			total_count += VREGISTER_COUNT;
+
+			diff = mytime () - t0;
+		}
+
+		calculate_result (256, total_count, diff);
+		newline ();
+		flush ();
+	}
+
+#ifdef __x86_64__
+	// The 64-bit datum tests are compiled for x86-64 only.
+	print (L"Vector 64-bit datum to main register transfers ");
+	flush ();
+
+	t0 = mytime ();
+	diff = 0;
+	total_count = 0;
+	while (diff < usec_per_test)
+	{
+		Vector64ToRegister (VREGISTER_COUNT);
+		total_count += VREGISTER_COUNT;
+
+		diff = mytime () - t0;
+	}
+
+	calculate_result (256, total_count, diff);
+	newline ();
+	flush ();
+#endif
+
+	//--------------------------------------
+	if (use_sse4) {
+		print (L"Main register 8-bit datum to vector register transfers ");
+		flush ();
+
+		t0 = mytime ();
+		diff = 0;
+		total_count = 0;
+		while (diff < usec_per_test)
+		{
+			Register8ToVector (VREGISTER_COUNT);
+			total_count += VREGISTER_COUNT;
+
+			diff = mytime () - t0;
+		}
+
+		calculate_result (256, total_count, diff);
+		newline ();
+		flush ();
+	}
+
+	//--------------------------------------
+	print (L"Main register 16-bit datum to vector register transfers ");
+	flush ();
+
+	t0 = mytime ();
+	diff = 0;
+	total_count = 0;
+	while (diff < usec_per_test)
+	{
+		Register16ToVector (VREGISTER_COUNT);
+		total_count += VREGISTER_COUNT;
+
+		diff = mytime () - t0;
+	}
+
+	calculate_result (256, total_count, diff);
+	newline ();
+	flush ();
+
+	//--------------------------------------
+	if (use_sse4) {
+		print (L"Main register 32-bit datum to vector register transfers ");
+		flush ();
+
+		t0 = mytime ();
+		diff = 0;
+		total_count = 0;
+		while (diff < usec_per_test)
+		{
+			Register32ToVector (VREGISTER_COUNT);
+			total_count += VREGISTER_COUNT;
+
+			diff = mytime () - t0;
+		}
+
+		calculate_result (256, total_count, diff);
+		newline ();
+		flush ();
+	}
+
+#ifdef __x86_64__
+	//--------------------------------------
+	print (L"Main register 64-bit datum to vector register transfers ");
+	flush ();
+
+	t0 = mytime ();
+	diff = 0;
+	total_count = 0;
+	while (diff < usec_per_test)
+	{
+		Register64ToVector (VREGISTER_COUNT);
+		total_count += VREGISTER_COUNT;
+
+		diff = mytime () - t0;
+	}
+
+	calculate_result (256, total_count, diff);
+	newline ();
+	flush ();
+#endif	// __x86_64__
+#endif	// !__arm__ && !__aarch64__
+}
+
+//----------------------------------------------------------------------------
+// Name:	stack_test
+// Purpose:	Determines bandwidth of stack-to/from-register transfers.
+//----------------------------------------------------------------------------
+void
+stack_test ()
+{
+	long long total_count = 0;
+	unsigned long t0;
+	unsigned long diff = 0;
+
+#if defined(__x86_64__) || defined(__aarch64__)
+	print (L"Stack-to-register transfers (64-bit) ");
+#else
+	print (L"Stack-to-register transfers (32-bit) ");
+#endif
+	flush ();
+
+	// Repeat StackReader() until usec_per_test has elapsed, then report.
+	diff = 0;
+	total_count = 0;
+	t0 = mytime ();
+	while (diff < usec_per_test)
+	{
+		StackReader (REGISTER_COUNT);	// REGISTER_COUNT is #defined inside register_test()
+		total_count += REGISTER_COUNT;
+
+		diff = mytime () - t0;
+	}
+
+	calculate_result (256, total_count, diff);	// 256 is the chunk_size argument — presumably bytes moved per count; TODO confirm
+	newline ();
+	flush ();
+
+#if defined(__x86_64__) || defined(__aarch64__)
+	print (L"Register-to-stack transfers (64-bit) ");
+#else
+	print (L"Register-to-stack transfers (32-bit) ");
+#endif
+	flush ();
+
+	// Same again for StackWriter().
+	diff = 0;
+	total_count = 0;
+	t0 = mytime ();
+	while (diff < usec_per_test)
+	{
+		StackWriter (REGISTER_COUNT);
+		total_count += REGISTER_COUNT;
+
+		diff = mytime () - t0;
+	}
+
+	calculate_result (256, total_count, diff);
+	newline ();
+	flush ();
+}
+
+//----------------------------------------------------------------------------
+// Name:	library_test
+// Purpose:	Performs C library tests (memset, memcpy).
+//----------------------------------------------------------------------------
+void
+library_test ()
+{
+	char *a1, *a2;
+	unsigned long t, t0;
+	int i;
+
+	// NT_SIZE is the size of each buffer in bytes; NT_SIZE2 is the number
+#if defined(__WIN32__) && (defined(__arm__) || defined(__aarch64__))
+	#define NT_SIZE (1024*1024)
+	#define NT_SIZE2 (50)
+#elif !defined(__WIN32__) && (defined(__arm__) || defined(__aarch64__))
+#if defined(DRAM_SIZE_SMALL)
+	#define NT_SIZE (16*1024*1024)
+#else
+	#define NT_SIZE (32*1024*1024)
+#endif
+	#define NT_SIZE2 (50)
+#else
+	#define NT_SIZE (64*1024*1024)
+	#define NT_SIZE2 (100)
+#endif
+	// of passes each library call makes over it while being timed.
+	a1 = malloc (NT_SIZE);
+	if (!a1)
+		error ("Out of memory");
+
+	a2 = malloc (NT_SIZE);
+	if (!a2)
+		error ("Out of memory");
+
+	// memset: fill a1 NT_SIZE2 times (the pass number is the fill byte).
+	t0 = mytime ();
+	for (i=0; i<NT_SIZE2; i++) {
+		memset (a1, i, NT_SIZE);
+	}
+	t = mytime ();
+
+	print (L"Library: memset ");
+	calculate_result (NT_SIZE, NT_SIZE2, t-t0);
+	newline ();
+
+	flush ();
+
+	// memcpy: copy a1 into a2 NT_SIZE2 times.
+	t0 = mytime ();
+	for (i=0; i<NT_SIZE2; i++) {
+		memcpy (a2, a1, NT_SIZE);
+	}
+	t = mytime ();
+
+	print (L"Library: memcpy ");
+	calculate_result (NT_SIZE, NT_SIZE2, t-t0);
+	newline ();
+
+	flush ();
+
+	free (a1);
+	free (a2);
+}
+
+//----------------------------------------------------------------------------
+// Name:	network_test_core
+// Purpose:	Performs the network test, talking to and receiving data
+//		back from a transponder node.
+// Note:	Port number specified using server:# notation.
+// Returns:	-1 on error, else the network duration in microseconds.
+//----------------------------------------------------------------------------
+long
+network_test_core (const char *net_path, char *chunk,
+			unsigned long chunk_size,
+			unsigned long count)
+{
+	char hostname [PATH_MAX];
+	char *s;
+	int port = NETWORK_DEFAULT_PORTNUM ;
+
+	// Reject names that would overflow hostname[] in the copy below.
+	if (strlen (net_path) >= sizeof (hostname))
+		return -1;
+	strcpy (hostname, net_path);
+	if ((s = strchr (hostname, ':'))) {
+		*s++ = 0;
+		port = atoi (s);
+	}
+
+	struct hostent* host = gethostbyname (hostname);
+	if (!host)
+		return -1;
+
+	char *host_ip = inet_ntoa (*(struct in_addr *)*host->h_addr_list);
+	int sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+	if (sock < 0)
+		return -1;
+
+	struct sockaddr_in addr = {0};	// zeroed so the padding is defined
+	addr.sin_family = AF_INET;
+	addr.sin_addr.s_addr = inet_addr(host_ip);
+	addr.sin_port = htons(port);
+
+	if (connect (sock, (struct sockaddr*) &addr, sizeof (struct sockaddr)))
+	{
+		close (sock);
+		return -1;
+	}
+
+	//------------------------------------
+	// Send all of our data.  A failed send aborts the test at once.
+	//
+	unsigned long t0 = mytime ();
+	unsigned long i;
+	for (i = 0; i < count; i++) {
+		if (send (sock, chunk, chunk_size, 0) < 0) {
+			close (sock);
+			return -1;
+		}
+	}
+
+	//------------------------------------
+	// Read the response.
+	//
+	char *buffer = malloc (chunk_size);
+	if (!buffer) {
+		close (sock);
+		return -1;
+	}
+	int amount = recv (sock, buffer, chunk_size, 0);
+	if (amount <= 0) {
+		close (sock);
+		free (buffer);	// was leaked on this path
+		return -1;
+	}
+
+	long t = mytime () - t0;
+	close (sock);
+	free (buffer);
+	return t;
+}
+
+//----------------------------------------------------------------------------
+// Name:	ip_to_str
+// Purpose:	Writes addr's low four bytes, lowest first, as "a.b.c.d" into str.
+//----------------------------------------------------------------------------
+void
+ip_to_str (unsigned long addr, char *str)
+{
+	unsigned short octet [4];
+
+	if (str) {
+		for (int n = 0; n < 4; n++)
+			octet [n] = 0xff & (addr >> (8 * n));
+		sprintf (str, "%u.%u.%u.%u", octet[0], octet[1], octet[2], octet[3]);
+	}
+}
+
+//----------------------------------------------------------------------------
+// Name:	network_transponder
+// Purpose:	Act as a transponder, receiving chunks of data and sending
+//		back an acknowledgement once the entire chunk is read.
+// Returns:	False if a problem occurs setting up the network socket.
+//----------------------------------------------------------------------------
+bool
+network_transponder ()
+{
+	struct sockaddr_in sin, from;
+
+	//------------------------------
+	// Get listening socket for port.
+	// Then listen on given port#.
+	//
+	sin.sin_family = AF_INET;
+	sin.sin_addr.s_addr = htonl(INADDR_ANY);
+	sin.sin_port = htons(NETWORK_DEFAULT_PORTNUM);
+	int listensock;
+	if ((listensock = socket (AF_INET, SOCK_STREAM, 0)) < 0)  {
+		perror ("socket");
+		return false;
+	}
+	if (bind (listensock, (struct sockaddr*) &sin, sizeof(sin)) < 0) {
+		perror ("bind");
+		close (listensock);
+		return false;
+	}
+	if (listen (listensock, 500) < 0) {
+		perror ("listen");
+		close (listensock);
+		return false;
+	}
+
+	bool done = false;
+	while (!done) {
+		//----------------------------------------
+		// Wait for a client to contact us.
+		//
+		socklen_t len = sizeof (struct sockaddr);
+		int sock = accept (listensock, (struct sockaddr*) &from, &len);
+		if (sock < 0) {
+			perror ("accept");
+			close (listensock);
+			return false;
+		}
+
+		if (len != sizeof (struct sockaddr_in)) {
+			close (sock);
+			close (listensock);
+			return false;
+		}
+
+		// The first read carries the chunk count as text.  A failed read is
+		// treated as empty, so that chunk[] is never indexed with -1.
+		char chunk [NETWORK_CHUNK_SIZE+1];
+		long n_chunks = 0;
+		int amount_read = read (sock, chunk, NETWORK_CHUNK_SIZE);
+		if (amount_read < 0)
+			amount_read = 0;
+		chunk [amount_read] = 0;
+		if (1 != sscanf (chunk, "%ld", &n_chunks)) {
+			close (sock);
+			close (listensock);
+			return false;
+		}
+
+		//----------------------------------------
+		// If the leader sends us a chunk count of
+		// -99, this indicates that we should exit.
+		//
+		if (n_chunks == -99) {
+			close (sock);
+			close (listensock);
+			return true;
+		}
+
+		// Bytes still to come.  The count is unsigned, so it is clamped at
+		// zero instead of wrapping when n_chunks is not positive or the
+		// first read already covered everything.
+		unsigned long long got = amount_read;	// >= 0 here
+		unsigned long long remaining = 0;
+		if (n_chunks > 0)
+			remaining = (unsigned long long) n_chunks * NETWORK_CHUNK_SIZE;
+		remaining = (remaining > got) ? remaining - got : 0;
+
+		while (remaining > 0) {
+			size_t want = NETWORK_CHUNK_SIZE;
+			if (remaining < want)
+				want = remaining;	// never read past the announced total
+			amount_read = read (sock, chunk, want);
+			if (amount_read < 0) {
+				perror ("read");
+				break;
+			}
+			if (!amount_read)
+				break;
+			remaining -= amount_read;
+		}
+
+		char *foo = "OK.\n\n";
+		write (sock, foo, 4);
+		close (sock);
+	}
+
+	return true;
+}
+
+//----------------------------------------------------------------------------
+// Name:	network_test
+// Purpose:	Acts as leader: for each destination, sends 1, 2, 4 ... 4096
+//		chunks through network_test_core() and prints the send rate.
+// Returns:	Always true; a failing destination is reported and abandoned.
+//----------------------------------------------------------------------------
+bool
+network_test (char **destinations, int n_destinations)
+{
+	int i;
+
+	//----------------------------------------
+	// The memory chunk starts with a 12-byte
+	// length of the overall send size.
+	// The memory chunk will have a list of
+	// the destinations in it.
+	// In future, there will be a mechanism
+	// for testing bandwidth between all nodes,
+	// not just the leader & each of the
+	// transponders.
+	//
+	char chunk [NETWORK_CHUNK_SIZE];
+	memset (chunk, 0, NETWORK_CHUNK_SIZE);
+	sprintf (chunk, "000000000000\n%d\n", n_destinations);
+	for (i = 0; i < n_destinations; i++) {
+		char *s = destinations [i];
+		int chunk_len = strlen (chunk);
+		int len = strlen (s);
+		if (chunk_len + len + sizeof " transp\n" <= NETWORK_CHUNK_SIZE) {
+			//----------------------------------------
+			// "transp" indicates that the given node
+			// has not yet been a leader.
+			// In future, "done" will indicate it has.
+			// (The bound above counts " transp\n" and the NUL.)
+			sprintf (chunk + chunk_len, "%s %s\n", s, "transp");
+		}
+	}
+
+	//----------------------------------------
+	// For each destination, run the test.
+	//
+	for (i = 0; i < n_destinations; i++) {
+		int j = 0;
+		bool problem = false;
+
+		char *hostname = destinations[i];
+		printf ("Bandwidth sending to %s:\n", hostname);
+
+		//----------------------------------------
+		// Send from 8kB up to 32 MB of data.
+		//
+		while (!problem && j < 13) {
+			unsigned long chunk_count = 1 << j;
+			unsigned long long amt_to_send = chunk_count;
+			amt_to_send *= NETWORK_CHUNK_SIZE;
+
+			if (!amt_to_send) // unlikely
+				break;
+
+			//----------------------------------------
+			// Write the overall send size into the
+			// 1st line of the chunk so that the
+			// transponder knows how large the send
+			// is without guessing.
+			//
+			sprintf (chunk, "%11lu", chunk_count);
+			chunk[11] = ' ';
+
+			//--------------------
+			// Send the data.
+			//
+			long duration = network_test_core (hostname,
+				chunk, NETWORK_CHUNK_SIZE, chunk_count);
+			if (duration == -1) {
+				problem = true;
+				fprintf (stderr, "\nCan't connect to %s\n", hostname);
+			} else {
+				unsigned long amt_in_kb = amt_to_send / 1024;
+				unsigned long amt_in_mb = amt_to_send / 1048576;
+				if (!amt_in_mb) {
+					printf ("\tSent %lu kB...", amt_in_kb);
+				} else {
+					printf ("\tSent %lu MB...", amt_in_mb);
+				}
+
+				//------------------------------
+				// Calculate rate in MB/sec.
+				//
+				// A zero duration would divide by zero below.
+				if (duration < 1)
+					duration = 1;
+				unsigned long long tmp = amt_to_send;	// total # bytes
+				tmp *= 1000000;				// bytes per second
+				tmp /= duration;
+				// Hundredths of a megabyte (here 10^6 bytes) per second.
+				tmp /= 10000;
+				unsigned long whole = tmp / 100;
+				unsigned long frac = tmp % 100;
+				printf ("%lu.%02lu MB/second\n", whole, frac);
+			}
+			j++;
+		}
+
+		puts ("");
+	}
+
+	return true;
+}
+
+//----------------------------------------------------------------------------
+// Name:	usage -- prints the command-line synopsis, then exits with status 0.
+//----------------------------------------------------------------------------
+void
+usage ()
+{
+	printf ("Usage for memory tests: bandwidth [--slow] [--quick] [--nosse2] [--nosse4] [--title string]\n");
+	printf ("Usage for starting network tests: bandwidth --network <ipaddr1> [<ipaddr2> ...]\n");
+	printf ("Usage for receiving network tests: bandwidth --transponder\n");
+	exit (0);
+}
+
+//----------------------------------------------------------------------------
+// Name:	main
+//----------------------------------------------------------------------------
+int
+main (int argc, char **argv)
+{
+	int i, chunk_size;
+
+	--argc;
+	++argv;
+
+	strcpy (graph_title, TITLE);
+
+	bool network_mode = false;
+	bool network_leader = false; // false => transponder
+	int network_destinations_size = 0;
+	int n_network_destinations = 0;
+	char **network_destinations = NULL;
+
+	i = 0;
+	while (i < argc) {
+		char *s = argv [i++];
+		if (!strcmp ("--network", s)) {
+			network_mode = true;
+			network_leader = true;
+			network_destinations_size = 20;
+			network_destinations = (char**) malloc (network_destinations_size * sizeof (char*));
+		}
+		else
+		if (!strcmp ("--transponder", s)) {
+			network_mode = true;
+		}
+		else
+		if (!strcmp ("--slow", s)) {
+			usec_per_test=20000000;	// 20 seconds per test.
+		}
+		else
+		if (!strcmp ("--quick", s)) {
+			usec_per_test = 250000;	// 0.25 seconds per test.
+		}
+		else
+		if (!strcmp ("--nosse2", s)) {
+			use_sse2 = false;
+			use_sse4 = false;
+		}
+		else
+		if (!strcmp ("--nosse4", s)) {
+			use_sse4 = false;
+		}
+		else
+		if (!strcmp ("--help", s)) {
+			usage ();
+		}
+		else
+		if (!strcmp ("--title", s) && i != argc) {
+			sprintf (graph_title, "%s -- %s", TITLE, argv[i++]);
+		}
+		else {
+			if (!network_mode || !network_leader)
+				usage ();
+
+			if ('-' == *s)
+				usage ();
+
+			if (n_network_destinations >= network_destinations_size) {
+				network_destinations_size *= 2;
+				int newsize = sizeof(char*) * network_destinations_size;
+				network_destinations = realloc (network_destinations,
+					newsize);
+			}
+
+			network_destinations [n_network_destinations++] = strdup (s);
+		}
+	}
+
+	msg[0] = 0;
+
+#if !(defined(__WIN32__) && (defined(__arm__) || defined(__aarch64__)))
+	printf ("This is bandwidth version %s.\n", VERSION);
+	printf ("Copyright (C) 2005-2010 by Zack T Smith.\n\n");
+	printf ("This software is covered by the GNU Public License.\n");
+	printf ("It is provided AS-IS, use at your own risk.\n");
+	printf ("See the file COPYING for more information.\n\n");
+	fflush (stdout);
+#else
+	println (L"(C) 2010 by Zack Smith");
+	println (L"Under GNU Public License");
+	println (L"Use at your own risk.");
+#endif
+
+	//----------------------------------------
+	// If network mode selected, enter it now.
+	// Currently cannot combine memory tests
+	// & network tests.
+	//
+	if (network_mode) {
+		if (network_leader) {
+			network_test (network_destinations, n_network_destinations);
+		} else {
+			network_transponder ();
+		}
+
+		puts ("Done.");
+		return 0;
+	}
+
+#if !defined(__arm__) && !defined(__aarch64__)
+	if (!has_sse2 ()) {
+		puts ("Processor does not have SSE2.");
+		use_sse2 = false;
+		use_sse4 = false;
+	}
+
+#ifdef __x86_64__
+	if (use_sse2)
+		println (L"Using 128-bit and 64-bit data transfers.");
+	else
+		println (L"Using 64-bit data transfers.");
+#else
+	if (use_sse2)
+		println (L"Using 128-bit and 32-bit data transfers.");
+	else
+		println (L"Using 32-bit data transfers.");
+#endif
+
+#else
+
+#if defined(__aarch64__)
+	println (L"Using 64-bit transfers.");
+#else
+	println (L"Using 32-bit transfers.");
+#endif
+
+	use_sse2 = false;
+#endif
+
+	println (L"Notation: kB = 1024 B, MB = 1048576 B.");
+
+	flush ();
+
+	//------------------------------------------------------------
+	// Attempt to obtain information about the CPU.
+	//
+#ifdef __linux__
+	struct stat st;
+	if (!stat ("/proc/cpuinfo", &st)) {
+#define TMPFILE "/tmp/bandw_tmp"
+		unlink (TMPFILE);
+		if (-1 == system ("grep MHz /proc/cpuinfo | uniq | sed \"s/[\\t\\n: a-zA-Z]//g\" > "TMPFILE))
+			perror ("system");
+
+		FILE *f = fopen (TMPFILE, "r");
+		if (f) {
+			float cpu_speed = 0.0;
+
+			if (1 == fscanf (f, "%g", &cpu_speed)) {
+				puts ("");
+				printf ("CPU speed is %g MHz.\n", cpu_speed);
+			}
+			fclose (f);
+		}
+
+#if !defined(__arm__) && !defined(__aarch64__)
+		unlink (TMPFILE);
+		if (-1 == system ("grep -i sse4 /proc/cpuinfo > "TMPFILE))
+			perror ("system");
+
+		if (!stat (TMPFILE, &st)) {
+			if (st.st_size < 2) {
+				use_sse4 = false;
+				puts ("Processor lacks SSE4.");
+			}
+		}
+
+		if (!use_sse2) {
+			unlink (TMPFILE);
+			if (-1 == system ("grep -i sse2 /proc/cpuinfo > "TMPFILE))
+				perror ("system");
+
+			if (!stat (TMPFILE, &st)) {
+				if (st.st_size < 2) {
+					use_sse2 = false;
+					puts ("Processor lacks SSE2.");
+				}
+			}
+		}
+#endif
+	} else {
+		printf ("CPU information is not available (/proc/cpuinfo).\n");
+	}
+	fflush (stdout);
+#endif
+
+	graph = BMP_new (graph_width, graph_height);
+	graph_init ();
+
+#if !defined(__arm__) && !defined(__aarch64__)
+	//------------------------------------------------------------
+	// SSE2 sequential reads.
+	//
+	if (use_sse2) {
+		graph_new_line ("Sequential 128-bit reads", RGB_RED);
+
+		newline ();
+
+		i = 0;
+		while ((chunk_size = chunk_sizes [i++])) {
+			int amount = do_read (chunk_size, true, false);
+
+			graph_add_point (chunk_size, amount);
+		}
+	}
+
+	//------------------------------------------------------------
+	// SSE2 random reads.
+	//
+	if (use_sse2) {
+		graph_new_line ("Random 128-bit reads", RGB_MAROON);
+
+		newline ();
+		srand (time (NULL));
+
+		i = 0;
+		while ((chunk_size = chunk_sizes [i++])) {
+			int amount = do_read (chunk_size, true, true);
+
+			graph_add_point (chunk_size, amount);
+		}
+	}
+
+	//------------------------------------------------------------
+	// SSE2 sequential writes that do not bypass the caches.
+	//
+	if (use_sse2) {
+		graph_new_line ("Sequential 128-bit cache writes", RGB_PURPLE);
+
+		newline ();
+
+		i = 0;
+		while ((chunk_size = chunk_sizes [i++])) {
+			int amount = do_write (chunk_size, SSE2, false);
+
+			graph_add_point (chunk_size, amount);
+		}
+	}
+
+	//------------------------------------------------------------
+	// SSE2 random writes that do not bypass the caches.
+	//
+	if (use_sse2) {
+		graph_new_line ("Random 128-bit cache writes", RGB_NAVYBLUE);
+
+		newline ();
+		srand (time (NULL));
+
+		i = 0;
+		while ((chunk_size = chunk_sizes [i++])) {
+			int amount = do_write (chunk_size, SSE2, true);
+
+			graph_add_point (chunk_size, amount);
+		}
+	}
+
+	//------------------------------------------------------------
+	// SSE2 sequential writes that do bypass the caches.
+	//
+	if (use_sse2) {
+		graph_new_line ("Sequential 128-bit bypassing writes", RGB_DARKORANGE);
+
+		newline ();
+
+		i = 0;
+		while ((chunk_size = chunk_sizes [i++])) {
+			int amount = do_write (chunk_size, SSE2_BYPASS, false);
+
+			graph_add_point (chunk_size, amount);
+		}
+	}
+
+	//------------------------------------------------------------
+	// SSE2 random writes that bypass the caches.
+	//
+	if (use_sse2) {
+		graph_new_line ("Random 128-bit bypassing writes", RGB_LEMONYELLOW);
+
+		newline ();
+		srand (time (NULL));
+
+		i = 0;
+		while ((chunk_size = chunk_sizes [i++])) {
+			int amount = do_write (chunk_size, SSE2_BYPASS, true);
+
+			graph_add_point (chunk_size, amount);
+		}
+	}
+#endif
+
+	//------------------------------------------------------------
+	// Sequential non-SSE2 reads.
+	//
+	newline ();
+#if defined(__x86_64__) || defined(__aarch64__)
+	graph_new_line ("Sequential 64-bit reads", RGB_BLUE);
+#else
+	graph_new_line ("Sequential 32-bit reads", RGB_BLUE);
+#endif
+
+	i = 0;
+	while ((chunk_size = chunk_sizes [i++])) {
+		int amount = do_read (chunk_size, false, false);
+
+		graph_add_point (chunk_size, amount);
+	}
+
+	//------------------------------------------------------------
+	// Random non-SSE2 reads.
+	//
+	newline ();
+#if defined(__x86_64__) || defined(__aarch64__)
+	graph_new_line ("Random 64-bit reads", RGB_CYAN);
+#else
+	graph_new_line ("Random 32-bit reads", RGB_CYAN);
+#endif
+	srand (time (NULL));
+
+	i = 0;
+	while ((chunk_size = chunk_sizes [i++])) {
+		int amount = do_read (chunk_size, false, true);
+
+		graph_add_point (chunk_size, amount);
+	}
+
+	//------------------------------------------------------------
+	// Sequential non-SSE2 writes.
+	//
+#if defined(__x86_64__) || defined(__aarch64__)
+	graph_new_line ("Sequential 64-bit writes", RGB_DARKGREEN);
+#else
+	graph_new_line ("Sequential 32-bit writes", RGB_DARKGREEN);
+#endif
+
+	newline ();
+
+	i = 0;
+	while ((chunk_size = chunk_sizes [i++])) {
+		int amount = do_write (chunk_size, NO_SSE2, false);
+
+		graph_add_point (chunk_size, amount);
+	}
+
+	//------------------------------------------------------------
+	// Random non-SSE2 writes.
+	//
+#if defined(__x86_64__) || defined(__aarch64__)
+	graph_new_line ("Random 64-bit writes", RGB_GREEN);
+#else
+	graph_new_line ("Random 32-bit writes", RGB_GREEN);
+#endif
+
+	newline ();
+	srand (time (NULL));
+
+	i = 0;
+	while ((chunk_size = chunk_sizes [i++])) {
+		int amount = do_write (chunk_size, NO_SSE2, true);
+
+		graph_add_point (chunk_size, amount);
+	}
+
+#if !defined(__arm__) && !defined(__aarch64__)
+	//------------------------------------------------------------
+	// SSE2 sequential copy.
+	//
+	if (use_sse2) {
+		graph_new_line ("Sequential 128-bit copy", 0x8f8844);
+
+		newline ();
+
+		i = 0;
+		while ((chunk_size = chunk_sizes [i++])) {
+			int amount = do_copy (chunk_size, SSE2);
+
+			graph_add_point (chunk_size, amount);
+		}
+	}
+#endif
+
+	//------------------------------------------------------------
+	// Register to register.
+	//
+	newline ();
+	register_test ();
+
+	//------------------------------------------------------------
+	// Stack to/from register.
+	//
+	newline ();
+	stack_test ();
+
+	//------------------------------------------------------------
+	// C library performance.
+	//
+	newline ();
+	library_test ();
+
+	//------------------------------------------------------------
+	// Framebuffer read & write.
+	//
+#if defined(__linux__) && defined(FBIOGET_FSCREENINFO)
+	newline ();
+	fb_readwrite (true);
+#endif
+
+#if defined(__WIN32__) && (defined(__arm__) || defined(__aarch64__))
+	MessageBoxW (0, msg, APPNAME, 0);
+
+	FILE *of = fopen ("bandwidth.log", "w");
+	if (of) {
+		dump (of);
+		fclose (of);
+	}
+#else
+	flush ();
+#endif
+
+	graph_make ();
+
+	BMP_write (graph, "bandwidth.bmp");
+	BMP_delete (graph);
+#if defined(__linux__) || defined(__CYGWIN__) || defined(__APPLE__)
+	puts ("\nWrote graph to bandwidth.bmp.");
+	puts ("");
+	puts ("Done.");
+#endif
+
+	return 0;
+}
diff --git a/main_thread.c b/main_thread.c
new file mode 100644
index 0000000..99e6078
--- /dev/null
+++ b/main_thread.c
@@ -0,0 +1,2379 @@
+/*============================================================================
+  bandwidth 0.24, a benchmark to estimate memory transfer bandwidth.
+  Copyright (C) 2005-2010 by Zack T Smith.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+  The author may be reached at fbui@comcast.net.
+ *===========================================================================*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <wchar.h>
+#include <math.h>
+#include <pthread.h>
+
+#include <netdb.h> // gethostbyname
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+
+#include "defs.h"
+#include "BMP.h"
+#include "config.h"
+
+#ifdef __WIN32__
+#include <windows.h>
+#endif
+
+#ifdef __linux__
+#include <linux/fb.h>
+#include <sys/mman.h>
+#endif
+
+#ifdef CONFIG_ARCH_S2L
+#if  defined(CONFIG_BSP_BOARD_S2LM_KIWI) || defined(CONFIG_BSP_BOARD_STRAWBERRY)
+#define DRAM_SIZE_SMALL
+#endif
+#endif
+// NOTE(review): DRAM_SIZE_SMALL is not referenced in the visible part of this file.
+//----------------------------------------
+// Graphing data.
+//
+static char graph_title [500];	// Title string drawn by graph_init().
+#define TITLE "Results from bandwidth " VERSION " by Zack Smith, http://caladan.tk"
+static BMP *graph;	// Graph of results.
+static int graph_width = 1280;	// Bitmap dimensions passed to BMP_new().
+static int graph_height = 720;
+static int graph_left_margin = 100;
+static int graph_margin = 50; // top/bottom/right
+static int graph_x_span = 1;	// Extent of the plot area; set by graph_init().
+static int graph_y_span = 1;
+static int graph_last_x = -1;	// Previous plotted point; -1 means "none yet".
+static int graph_last_y = -1;
+static unsigned long graph_fg = RGB_BLACK;	// Colour of the line being drawn.
+static int legend_y;	// y position of the next legend entry.
+#define MAX_GRAPH_DATA 5000
+static long graph_data [MAX_GRAPH_DATA];	// Flat list of (DATUM_* tag, value) pairs.
+static int graph_data_index = 0;	// Number of graph_data slots in use.
+enum {
+	DATUM_SIZE=0,	// Value is the size given to graph_add_point().
+	DATUM_AMOUNT=1,	// Value is the amount given to graph_add_point().
+	DATUM_COLOR=2,	// Value is the colour given to graph_new_line().
+};
+static int max_bandwidth = 0;	// Always 10 times the # of megabyte/sec.
+
+static bool use_sse2 = true;
+static bool use_sse4 = true;
+
+static int goon_flag = 1;	// Workers loop while nonzero. NOTE(review): plain int shared across threads, unsynchronised.
+static int thread_num = 4;	// Worker threads created per test.
+static int chunk_index = 0;	// First chunk_sizes index searched by graph_plot().
+static int cpu_num = 0;	// Divisor in "id % cpu_num" when pinning workers; not assigned in the visible code.
+
+struct thread_params {	// Arguments handed to one worker thread.
+	int id;				// Worker index; also picks the CPU (id % cpu_num).
+	unsigned long size;		// Size passed on to Reader/Writer (size/256 in random mode).
+	bool random;			// true: use chunk_ptrs; false: use chunk.
+	unsigned long **chunk_ptrs;	// Shuffled pointers to 256-byte blocks; set only in random mode.
+	unsigned char *chunk;		// Start of the worker's region; set only in sequential mode.
+	unsigned long loops;		// In: loops per call. Out: total loops the worker issued.
+};
+
+//----------------------------------------
+// Parameters for the tests.
+//
+static long usec_per_test = 5000000;	// 5 seconds per test.
+// Chunk sizes in bytes; 0 ends the list. Entry i pairs with chunk_sizes_log2[i].
+static int chunk_sizes[] = {
+	256,
+	512,
+	768,
+	1024,
+	2048,
+	3072,
+	4096,
+	6144,
+	8192,	// Some processors' L1 data caches are only 8kB.
+	12288,
+	16384,
+	20480,
+	24576,
+	28672,
+	32768,	// Common L1 data cache size.
+	40960,
+	49152,
+	65536,
+	131072,	// Old L2 cache size.
+	192 * 1024,
+	256 * 1024,	// Old L2 cache size.
+	384 * 1024,
+	512 * 1024,	// Old L2 cache size.
+	768 * 1024,
+	1 << 20,	// 1 MB = common L2 cache size.
+	(1024 + 256) * 1024,	// 1.25
+	(1024 + 512) * 1024,	// 1.5
+	(1024 + 768) * 1024,	// 1.75
+	1 << 21,	// 2 MB = common L2 cache size.
+	(2048 + 256) * 1024,	// 2.25
+	(2048 + 512) * 1024,	// 2.5
+	(2048 + 768) * 1024,	// 2.75
+	3072 * 1024,	// 3 MB = common L2 cache size.
+	1 << 22,	// 4 MB
+	5242880,	// 5 megs
+	6291456,	// 6 megs (std L2 cache size)
+	16 * 1024 * 1024,	// 16 MB
+	64 * 1024 * 1024,	// 64 MB
+	0	// End-of-list marker tested by every "while ((chunk_size = ...))" loop.
+};
+
+//----------------------------------------
+// Under CeGCC, the math.h log2() function
+// turned out to be very inaccurate e.g.
+// log2(8)=1.44, so I have here hard-coded
+// the logarithms.
+// Entry i is log2 of chunk_sizes[i]; 0 ends the list.
+static double chunk_sizes_log2[] =
+{
+	8,		// 256 B
+	9,		// 512 B
+	9.585,		// 768 B
+	10,		// 1 kB
+	11,		// 2 kB
+	11.585,		// 3 kB
+	12,		// 4 kB
+	12.585,		// 6 kB
+	13,		// 8 kB
+	13.585,		// 12 kB
+	14,		// 16 kB
+	14.3219,	// 20 kB
+	14.585,		// 24 kB
+	14.8074,	// 28 kB
+	15,		// 32 kB
+	15.3219,	// 40 kB
+	15.585,		// 48 kB
+	16,		// 64 kB
+	17,		// 128 kB
+	17.585,		// 192 kB
+	18,		// 256 kB
+	18.585,		// 384 kB
+	19,		// 512 kB
+	19.585,		// 768 kB
+	20,		// 1 MB
+	20.3219,	// 1.25
+	20.585,		// 1.5
+	20.8074,	// 1.75
+	21,		// 2 MB
+	21.1699,	// 2.25 MB
+	21.3219,	// 2.5 MB
+	21.4594,	// 2.75 MB
+	21.585,		// 3 MB
+	22,		// 4 MB
+	22.3219,	// 5 MB
+	22.585,		// 6 MB
+	24,		// 16 MB
+	26,		// 64 MB
+	0
+};
+// Smallest and largest table entry, truncated to int (whole powers of two).
+static int min_chunk_size = 1;	// These are determined in graph_draw_labels().
+static int max_chunk_size = 1;
+
+//----------------------------------------------------------------------------
+// Name:	error
+// Purpose:	Reports the message s and terminates the process; never returns.
+//----------------------------------------------------------------------------
+void error (char *s)
+{
+#ifndef __WIN32__
+	fprintf (stderr, "Error: %s\n", s);
+	exit (1);
+#else
+	wchar_t tmp [200];
+	int i;
+	for (i = 0; s[i] && i < 199; i++)	// Truncate: tmp must keep room for the terminator.
+		tmp[i] = s[i];
+	tmp[i] = 0;
+	MessageBoxW (0, tmp, L"Error", 0);
+	ExitProcess (0);
+#endif
+}
+
+// Prints value on stdout as 16 hex digits, most significant nibble first.
+void
+dump_hex64 (unsigned long long value)
+{
+	int shift;
+
+	// Walk from bits 63..60 down to bits 3..0, one nibble per step.
+	for (shift = 60; shift >= 0; shift -= 4) {
+		unsigned int nibble = (unsigned int) ((value >> shift) & 0xf);
+		printf ("%1x", nibble);
+	}
+}
+
+//============================================================================
+// Graphing logic.
+//============================================================================
+
+//----------------------------------------------------------------------------
+// Name:	graph_draw_labels
+// Purpose:	Draws tick marks and labels on both axes; sets min/max_chunk_size.
+//----------------------------------------------------------------------------
+void
+graph_draw_labels ()
+{
+	int i;
+
+	//----------------------------------------
+	// Horizontal
+	//
+	//--------------------
+	// Establish min & max.
+	//
+	min_chunk_size = 1000;
+	max_chunk_size = 0;
+	i = 0;
+	int j;
+	while ((j = chunk_sizes_log2 [i])) {	// The assignment truncates each log2 to int.
+		if (j < min_chunk_size)
+			min_chunk_size = j;
+		if (j > max_chunk_size)
+			max_chunk_size = j;
+		i++;
+	}
+
+	for (i = min_chunk_size; i <= max_chunk_size; i++) {
+		char str[20];
+		int x = graph_left_margin +
+			((i-min_chunk_size) * graph_x_span) /
+			(max_chunk_size - min_chunk_size);
+		int y = graph_height - graph_margin + 10;
+
+		unsigned long amt = 1UL << i;	// One tick per power of two; shift in the result's width.
+		if (amt < 1024)
+			snprintf (str, sizeof str, "%lu B", amt);
+		else if (amt < (1<<20)) {
+			snprintf (str, sizeof str, "%lu kB", amt >> 10);
+		}
+		else {
+			j = amt >> 20;
+			switch ((amt >> 18) & 3) {
+			case 0: snprintf (str, sizeof str, "%d MB", j); break;
+			case 1: snprintf (str, sizeof str, "%d.25 MB", j); break;
+			case 2: snprintf (str, sizeof str, "%d.5 MB", j); break;
+			case 3: snprintf (str, sizeof str, "%d.75 MB", j); break;
+			}
+		}
+
+		BMP_vline (graph, x, y, y-10, RGB_BLACK);
+		BMP_draw_mini_string (graph, str, x - 10, y+8, RGB_BLACK);
+	}
+
+	//----------------------------------------
+	// Vertical
+	// One tick per 10000 units of max_bandwidth, which graph_make() keeps >= 10000.
+	for (i = 0; i <= (max_bandwidth/10000); i++) {
+		char str[20];
+		int x = graph_left_margin - 10;
+		int y = graph_height - graph_margin -
+			(i * graph_y_span) / (max_bandwidth/10000);
+
+		BMP_hline (graph, x, x+10, y, RGB_BLACK);
+
+		snprintf (str, sizeof str, "%d GB/s", i);
+		BMP_draw_mini_string (graph, str,
+			x - 40, y - MINIFONT_HEIGHT/2, RGB_BLACK);
+	}
+}
+
+// Clears the bitmap, draws the two axes and the title, and resets the plot
+// spans and the legend position. Does nothing when no bitmap exists.
+void
+graph_init ()
+{
+	if (graph) {
+		const int axis_y = graph_height - graph_margin;	// Row of the x axis.
+
+		BMP_clear (graph, RGB_WHITE);
+		BMP_hline (graph, graph_left_margin, graph_width - graph_margin,
+			axis_y, RGB_BLACK);
+		BMP_vline (graph, graph_left_margin, graph_margin,
+			axis_y, RGB_BLACK);
+		graph_x_span = graph_width - (graph_margin + graph_left_margin);
+		graph_y_span = graph_height - 2 * graph_margin;
+
+		BMP_draw_mini_string (graph, graph_title,
+			graph_left_margin, graph_margin/2, RGB_BLACK);
+		legend_y = graph_margin;
+	}
+}
+
+// Starts a new line on the graph: draws its legend entry, makes color the
+// current foreground, and records a DATUM_COLOR pair in graph_data.
+void
+graph_new_line (char *str, unsigned long color)
+{
+	const int legend_x = graph_width - graph_margin - 200;
+	BMP_draw_mini_string (graph, str, legend_x, legend_y, color);
+	legend_y += 10;
+	graph_fg = color;
+	graph_last_x = -1;
+	graph_last_y = -1;
+	if (MAX_GRAPH_DATA-2 <= graph_data_index)
+		error ("Too many graph data.");
+	graph_data [graph_data_index] = DATUM_COLOR;
+	graph_data [graph_data_index + 1] = (long) color;
+	graph_data_index += 2;
+}
+
+//----------------------------------------------------------------------------
+// Name:	graph_add_point
+// Purpose:	Queues one (size, amount) point as two tagged pairs in graph_data.
+//----------------------------------------------------------------------------
+void
+graph_add_point (int size, int amount)
+{
+	long *slot;
+	if (MAX_GRAPH_DATA-4 <= graph_data_index)
+		error ("Too many graph data.");
+	slot = &graph_data [graph_data_index];
+	slot[0] = DATUM_SIZE;   slot[1] = size;
+	slot[2] = DATUM_AMOUNT; slot[3] = amount;
+	graph_data_index += 4;
+}
+
+//----------------------------------------------------------------------------
+// Name:	graph_plot
+// Purpose:	Draws a segment from the previous point of the current line to
+//		(size, amount); the first point of a line only sets the start.
+//----------------------------------------------------------------------------
+void
+graph_plot (int size, int amount)
+{
+	//----------------------------------------
+	// Get the log2 of the chunk size.
+	// We cannot rely on the libm math.h log2
+	// function, because under CeGCC,
+	// log2(8) = 1.44.
+	//
+	int i = chunk_index;
+	while (chunk_sizes [i] && chunk_sizes [i] != size)
+		i++;
+	if (!chunk_sizes [i])
+		error ("Lookup of chunk size failed.");
+	double tmp = chunk_sizes_log2 [i];
+
+	//----------------------------------------
+	// Plot the point. The x axis is
+	// logarithmic, base 2.
+	//
+	tmp -= (double) min_chunk_size;
+	tmp *= (double) graph_x_span;
+	tmp /= (double) (max_chunk_size - min_chunk_size);
+
+	// Form the product in 64 bits: amount * graph_y_span can exceed INT_MAX.
+	int x = graph_left_margin + (int) tmp;
+	int y = graph_height - graph_margin -
+		(int) (((long long) amount * graph_y_span) / max_bandwidth);
+
+	if (graph_last_x != -1 && graph_last_y != -1) {
+		BMP_line (graph, graph_last_x, graph_last_y, x, y, graph_fg);
+	}
+
+	graph_last_x = x;
+	graph_last_y = y;
+}
+
+//----------------------------------------------------------------------------
+// Name:	graph_make
+// Purpose:	Scales the y axis to the largest recorded amount, draws the
+//		labels, then replays graph_data to plot every line.
+//----------------------------------------------------------------------------
+void
+graph_make ()
+{
+	int pos;
+	int peak = 0;
+
+	//----------------------------------------
+	// First pass: find the largest amount so
+	// the graph can be scaled vertically.
+	//
+	for (pos = 0; pos < graph_data_index; pos += 2) {
+		int candidate;
+		if (graph_data[pos] != DATUM_AMOUNT)
+			continue;
+		candidate = graph_data[pos+1];
+		if (peak < candidate)
+			peak = candidate;
+	}
+
+	// Round down to a multiple of 10000, then add one step of headroom.
+	max_bandwidth = (peak / 10000 + 1) * 10000;
+
+	graph_draw_labels ();
+
+	//----------------------------------------
+	// Second pass: replay the (tag, value) pairs and plot each point
+	// once both its size and its amount have been read.
+	//
+	int pending_size = -1, pending_amt = -1;
+	for (pos = 0; pos < graph_data_index; pos += 2) {
+		int tag = graph_data[pos];
+		long value = graph_data[pos+1];
+		if (tag == DATUM_AMOUNT) {
+			pending_amt = value;
+		} else if (tag == DATUM_SIZE) {
+			pending_size = value;
+		} else if (tag == DATUM_COLOR) {
+			graph_fg = (unsigned long) value;
+			graph_last_x = graph_last_y = -1;
+		}
+
+		if (pending_amt == -1 || pending_size == -1)
+			continue;
+		graph_plot (pending_size, pending_amt);
+		pending_amt = pending_size = -1;
+	}
+}
+
+//============================================================================
+// Output buffer logic.
+//============================================================================
+
+#define MSGLEN 10000
+static wchar_t msg [MSGLEN];	// Pending output text; emptied by dump().
+
+void print (wchar_t *s)	// Appends s to msg; text that does not fit is dropped.
+{
+	wcsncat (msg, s, MSGLEN - 1 - wcslen (msg));	// Leave room for the terminator.
+}
+
+void newline ()	// Appends a line break to msg unless msg is already full.
+{
+	wcsncat (msg, L"\n", MSGLEN - 1 - wcslen (msg));
+}
+
+void println (wchar_t *s)	// Appends s and a line break to msg; overflow is dropped.
+{
+	wcsncat (msg, s, MSGLEN - 1 - wcslen (msg));
+	wcsncat (msg, L"\n", MSGLEN - 1 - wcslen (msg));
+}
+
+void print_int (int d)	// Appends d in decimal to msg.
+{
+#if defined(__WIN32__) && (defined(__arm__) || defined(__aarch64__))
+	swprintf (msg + wcslen (msg), L"%d", d);	// This swprintf variant takes no size.
+#else
+	swprintf (msg + wcslen (msg), MSGLEN - wcslen (msg), L"%d", d);	// Size is the space left, not MSGLEN.
+#endif
+}
+
+void println_int (int d)	// Appends d in decimal followed by a line break.
+{
+	print_int (d);
+	newline ();
+}
+
+void print_result (long double result)	// Appends result as "N.N MB/s" to msg.
+{
+#if defined(__WIN32__) && (defined(__arm__) || defined(__aarch64__))
+	swprintf (msg + wcslen (msg), L"%.1Lf MB/s", result);	// This swprintf variant takes no size.
+#else
+	swprintf (msg + wcslen (msg), MSGLEN - wcslen (msg), L"%.1Lf MB/s", result);	// Size is the space left.
+#endif
+}
+
+// Writes the buffered message to f (stdout when f is NULL), narrowing each
+// wide character to char, then empties the buffer.
+void dump (FILE *f)
+{
+	FILE *out = f ? f : stdout;
+	const wchar_t *cursor;
+
+	for (cursor = msg; *cursor; cursor++) {
+		char narrow = (char) *cursor;
+		fputc (narrow, out);
+	}
+
+	msg [0] = 0;
+}
+
+void flush ()	// Pushes pending output to the user.
+{
+#if defined(__WIN32__) && (defined(__arm__) || defined(__aarch64__))
+	MessageBeep (MB_OK);	// msg is left untouched on this path.
+#else
+	dump (NULL);	// Write msg to stdout and empty it.
+	fflush (stdout);
+#endif
+}
+
+// Appends size to the message buffer as "N B", "N kB" or "N MB"; the MB
+// form carries a .25/.5/.75 suffix for the quarter-megabyte remainder.
+void print_size (unsigned long size)
+{
+	if (size >= (1<<20)) {
+		unsigned long quarters = (size >> 18) & 3;
+		print_int (size >> 20);
+		if (quarters == 1)
+			print (L".25");
+		else if (quarters == 2)
+			print (L".5");
+		else if (quarters == 3)
+			print (L".75");
+		print (L" MB");
+	} else {
+		print_int (size < 1024 ? size : size >> 10);
+		print (size < 1024 ? L" B" : L" kB");
+	}
+}
+
+//============================================================================
+// Timing logic.
+//============================================================================
+
+//----------------------------------------------------------------------------
+// Name:	mytime
+// Purpose:	Reports the current time in microseconds. The value may wrap, so
+//		callers should only use differences between two readings.
+//----------------------------------------------------------------------------
+unsigned long mytime ()
+{
+#ifndef __WIN32__
+	struct timeval tv;
+	gettimeofday (&tv, NULL);	// The timezone argument is obsolete.
+	// Unsigned arithmetic wraps; 1000000 * tv_sec overflows a 32-bit signed long.
+	return 1000000UL * (unsigned long) tv.tv_sec + (unsigned long) tv.tv_usec;
+#else
+	return 1000 * GetTickCount ();	// accurate enough.
+#endif
+}
+
+//----------------------------------------------------------------------------
+// Name:	calculate_result
+// Purpose:	Converts total_count passes over chunk_size bytes in diff
+//		microseconds into MB/s (MB = 1048576 B) and prints it.
+// Returns:	10 times the number of megabytes per second.
+//----------------------------------------------------------------------------
+int
+calculate_result (unsigned long chunk_size, long long total_count, long diff)
+{
+	long double mb_per_sec;
+
+	if (diff == 0)
+		error ("Zero time difference.");
+
+	// bytes * (usec per sec) / (bytes per MB) / elapsed usec.
+	mb_per_sec = (long double) chunk_size * (long double) total_count
+		* 1000000. / 1048576. / (long double) diff;
+	print_result (mb_per_sec);
+
+	long scaled = (long) (10.0 * mb_per_sec);
+	return scaled;
+}
+
+//============================================================================
+// Tests.
+//============================================================================
+
+//----------------------------------------------------------------------------
+// Name:	do_write
+// Purpose:	Performs write on chunk of memory of specified size.
+//----------------------------------------------------------------------------
+enum {	// Values for the mode argument of do_write() and do_copy().
+	NO_SSE2,	// Labelled "(64-bit)" or "(32-bit)" in the output.
+	SSE2,		// Labelled "(128-bit)".
+	SSE2_BYPASS,	// Labelled "bypassing cache (128-bit)".
+};
+
+// Worker for do_write(): pins itself to one CPU, then calls Writer or
+// RandomWriter until goon_flag is cleared. On return params->loops holds
+// the total loop count this thread issued.
+static void *do_thread_write(void *arg)
+{
+	struct thread_params *params = (struct thread_params *)arg;
+	unsigned long total_count = 0;
+#if defined(__x86_64__) || defined(__aarch64__)
+	unsigned long value = 0x1234567689abcdef;
+#else
+	unsigned long value = 0x12345678;
+#endif
+	// cpu_num is initialised to 0; fall back to CPU 0 rather than divide by zero.
+	int cpu = cpu_num > 0 ? params->id % cpu_num : 0;
+	cpu_set_t mask;
+	CPU_ZERO(&mask);
+	CPU_SET(cpu, &mask);
+	// pthread_setaffinity_np() returns an error number on failure, not -1.
+	if (pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask) != 0)
+		fprintf(stderr, "set thread %d affinity failed\n", params->id);
+
+	while(goon_flag) {
+		total_count += params->loops;
+		if (params->random)
+			RandomWriter (params->chunk_ptrs, params->size/256, params->loops, value);
+		else
+			Writer (params->chunk, params->size, params->loops, value);
+	}
+
+	params->loops = total_count;
+	return NULL;	// Returning from the start routine ends the thread.
+}
+
+
+// Times thread_num worker threads writing a size-byte chunk for
+// usec_per_test microseconds, prints the result and returns 10 x MB/s.
+// NOTE(review): mode only selects the printed label; the workers always
+// call Writer/RandomWriter -- confirm this is intended.
+int
+do_write (unsigned long size, int mode, bool random)
+{
+	unsigned char *chunk;
+	unsigned char *chunk0;
+	unsigned long loops;
+	unsigned long long total_count=0;
+	unsigned long diff=0, t0;
+	unsigned long tmp;
+	unsigned long **chunk_ptrs = NULL;
+	struct thread_params *params;
+	pthread_t *tid;
+	int i, rval;
+
+	if (size & 255)
+		error ("do_write(): chunk size is not multiple of 256.");
+
+	params = malloc(sizeof(struct thread_params) * thread_num);
+	if (!params)
+		error ("Out of memory");
+
+	tid = malloc(sizeof(pthread_t) * thread_num);
+	if (!tid)
+		error ("Out of memory");
+
+	//-------------------------------------------------
+	// Over-allocate so the working pointer can be rounded up to 16 bytes.
+	chunk0 = malloc (size+32);
+	chunk = chunk0;
+	if (!chunk)
+		error ("Out of memory");
+
+	tmp = (unsigned long) chunk;
+	if (tmp & 15) {
+		tmp -= (tmp & 15);
+		tmp += 16;
+		chunk = (unsigned char*) tmp;
+	}
+
+	//----------------------------------------
+	// Set up random pointers to chunks.
+	//
+	if (random) {
+		tmp = size/256;
+		chunk_ptrs = (unsigned long**) malloc (sizeof (unsigned long*) * tmp);
+		if (!chunk_ptrs)
+			error ("Out of memory.");
+
+		//----------------------------------------
+		// Store pointers to all chunks into array.
+		//
+		for (i = 0; i < tmp; i++) {
+			chunk_ptrs [i] = (unsigned long*) (chunk + 256 * i);
+		}
+
+		//----------------------------------------
+		// Randomize the array of chunk pointers.
+		//
+		int k = 100;
+		while (k--) {
+			for (i = 0; i < tmp; i++) {
+				int j = rand() % tmp;
+				if (i != j) {
+					unsigned long *ptr = chunk_ptrs [i];
+					chunk_ptrs [i] = chunk_ptrs [j];
+					chunk_ptrs [j] = ptr;
+				}
+			}
+		}
+	}
+
+	//-------------------------------------------------
+	if (random)
+		print (L"Random write ");
+	else
+		print (L"Sequential write ");
+
+	if (mode == SSE2) {
+		print (L"(128-bit), size = ");
+	}
+	else
+	if (mode == SSE2_BYPASS) {
+		print (L"bypassing cache (128-bit), size = ");
+	} else {
+#if defined(__x86_64__) || defined(__aarch64__)
+		print (L"(64-bit), size = ");
+#else
+		print (L"(32-bit), size = ");
+#endif
+	}
+
+	print_size (size);
+	print (L", ");
+
+	loops = (1 << 26) / size;// XX need to adjust for CPU MHz
+	//-------------------------------------------------
+	// Give each thread its own slice, unless a slice would be under 1 kB,
+	// in which case every thread works on the whole chunk.
+	tmp = size / thread_num;
+	for (i = 0; i < thread_num; i++) {
+		params[i].id = i;
+		params[i].random = random;
+		params[i].size = tmp < 1024 ? size : tmp;
+		if (random)
+			params[i].chunk_ptrs = tmp < 1024 ? chunk_ptrs : chunk_ptrs + i * (tmp / 256);
+		else
+			params[i].chunk = tmp < 1024 ? chunk : chunk + i * tmp;
+		params[i].loops = loops;
+	}
+
+	t0 = mytime ();
+	goon_flag = 1;
+	for (i = 0; i < thread_num; i++) {
+		rval = pthread_create(&tid[i] ,NULL, do_thread_write, &params[i]);
+		// pthread_create() returns an error number and leaves errno alone.
+		if (rval != 0) {
+			goon_flag = 0;	// Let the workers already started wind down.
+			fprintf (stderr, "can't create pthread: %s\n", strerror (rval));
+			error ("do_write(): thread creation failed.");
+		}
+	}
+
+	usleep(usec_per_test);
+	goon_flag = 0;
+	for (i = 0; i < thread_num; i++) {
+		pthread_join(tid[i], NULL);
+		total_count += params[i].loops;
+	}
+
+	diff = mytime () - t0;
+	total_count /= thread_num;	// Average loops per thread.
+
+	print (L"loops = ");
+	print_int (total_count);
+	print (L", ");
+	flush ();
+
+	int result = calculate_result (size, total_count, diff);
+	newline ();
+	flush ();
+
+	// All workers have been joined, so nothing references these any more.
+	free (chunk0);
+	free (chunk_ptrs);
+	free (params);
+	free (tid);
+	return result;
+}
+
+// Worker for do_read(): pins itself to one CPU, then calls Reader or
+// RandomReader until goon_flag is cleared. On return params->loops holds
+// the total loop count this thread issued.
+static void *do_thread_read(void *arg)
+{
+	struct thread_params *params = (struct thread_params *)arg;
+	unsigned long total_count = 0;
+	// cpu_num is initialised to 0; fall back to CPU 0 rather than divide by zero.
+	int cpu = cpu_num > 0 ? params->id % cpu_num : 0;
+	cpu_set_t mask;
+	CPU_ZERO(&mask);
+	CPU_SET(cpu, &mask);
+	// pthread_setaffinity_np() returns an error number on failure, not -1.
+	if (pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask) != 0)
+		fprintf(stderr, "set thread %d affinity failed\n", params->id);
+
+	while(goon_flag) {
+		total_count += params->loops;
+		if (params->random)
+			RandomReader (params->chunk_ptrs, params->size/256, params->loops);
+		else
+			Reader (params->chunk, params->size, params->loops);
+	}
+
+	params->loops = total_count;
+	return NULL;	// Returning from the start routine ends the thread.
+}
+
+//----------------------------------------------------------------------------
+// Name:	do_read
+// Purpose:	Times thread_num threads reading a size-byte chunk (sequentially or in random 256-byte blocks); returns 10 x MB/s.
+//----------------------------------------------------------------------------
+int
+do_read (unsigned long size, bool use_sse2, bool random)
+{
+	unsigned long diff=0;
+	unsigned long long total_count = 0;
+	unsigned char *chunk;
+	unsigned char *chunk0;
+	unsigned long tmp;
+	unsigned long **chunk_ptrs = NULL;
+	unsigned long t0, loops = (1 << 26) / size;	// XX need to adjust for CPU MHz
+	struct thread_params *params;
+	pthread_t *tid;
+	int i, rval;
+
+	if (size & 255)
+		error ("do_read(): chunk size is not multiple of 256.");
+
+	params = malloc(sizeof(struct thread_params) * thread_num);
+	if (!params)
+		error ("Out of memory");
+
+	tid = malloc(sizeof(pthread_t) * thread_num);
+	if (!tid)
+		error ("Out of memory");
+
+	//-------------------------------------------------
+	if (random)
+		print (L"Random read ");
+	else
+		print (L"Sequential read ");
+	// NOTE(review): use_sse2 only selects the label; workers always call Reader/RandomReader.
+	if (use_sse2) {
+		print (L"(128-bit), size = ");
+	} else {
+#if defined(__x86_64__) || defined(__aarch64__)
+		print (L"(64-bit), size = ");
+#else
+		print (L"(32-bit), size = ");
+#endif
+	}
+
+	print_size (size);
+	print (L", ");
+
+	flush ();
+
+	//-------------------------------------------------
+	chunk0 = chunk = malloc (size+32);
+	if (!chunk)
+		error ("Out of memory");
+
+	memset (chunk, 0, size);
+
+	tmp = (unsigned long) chunk;
+	if (tmp & 15) {
+		tmp -= (tmp & 15);
+		tmp += 16;
+		chunk = (unsigned char*) tmp;
+	}
+
+	//----------------------------------------
+	// Set up random pointers to chunks.
+	//
+	if (random) {
+		int tmp = size/256;
+		chunk_ptrs = (unsigned long**) malloc (sizeof (unsigned long*) * tmp);
+		if (!chunk_ptrs)
+			error ("Out of memory.");
+
+		//----------------------------------------
+		// Store pointers to all chunks into array.
+		//
+		int i;
+		for (i = 0; i < tmp; i++) {
+			chunk_ptrs [i] = (unsigned long*) (chunk + 256 * i);
+		}
+
+		//----------------------------------------
+		// Randomize the array of chunk pointers.
+		//
+		int k = 100;
+		while (k--) {
+			for (i = 0; i < tmp; i++) {
+				int j = rand() % tmp;
+				if (i != j) {
+					unsigned long *ptr = chunk_ptrs [i];
+					chunk_ptrs [i] = chunk_ptrs [j];
+					chunk_ptrs [j] = ptr;
+				}
+			}
+		}
+	}
+	// Give each thread its own slice, unless a slice would be under 1 kB.
+	tmp = size / thread_num;
+
+	for (i = 0; i < thread_num; i++) {
+		params[i].id = i;
+		params[i].random = random;
+		params[i].size = tmp < 1024 ? size : tmp;
+		if (random)
+			params[i].chunk_ptrs = tmp < 1024 ? chunk_ptrs : chunk_ptrs + i * (tmp / 256);
+		else
+			params[i].chunk = tmp < 1024 ? chunk : chunk + i * tmp;
+		params[i].loops = loops;
+	}
+
+	t0 = mytime ();
+	goon_flag = 1;
+
+	for (i = 0; i < thread_num; i++) {
+		rval = pthread_create(&tid[i] ,NULL, do_thread_read, &params[i]);
+		if (rval != 0) {	// pthread_create() returns an error number, not -1.
+			goon_flag = 0;	// Let the workers already started wind down.
+			fprintf (stderr, "can't create pthread: %s\n", strerror (rval));
+			error ("do_read(): thread creation failed.");
+		}
+	}
+
+	usleep(usec_per_test);
+
+	goon_flag = 0;
+
+	for (i = 0; i < thread_num; i++) {
+		pthread_join(tid[i], NULL);
+		total_count += params[i].loops;
+	}
+
+	diff = mytime () - t0;
+
+	total_count /= thread_num;
+
+	print (L"loops = ");
+	print_int (total_count);
+	print (L", ");
+
+	int result = calculate_result (size, total_count, diff);
+	newline ();
+
+	flush ();
+
+	free (chunk0);
+
+	if (chunk_ptrs)
+		free (chunk_ptrs);
+
+	free(params);
+	free(tid);
+
+	return result;
+}
+
+
+
+//----------------------------------------------------------------------------
+// Name:	do_copy
+// Purpose:	Times repeated copies between two size-byte buffers until
+//		usec_per_test microseconds have elapsed, then prints the outcome.
+// Returns:	The value produced by calculate_result().
+//----------------------------------------------------------------------------
+int
+do_copy (unsigned long size, int mode)
+{
+	unsigned char *src_raw;		// As returned by malloc; freed at the end.
+	unsigned char *dst_raw;
+	unsigned char *src;		// 16-byte aligned views of the above.
+	unsigned char *dst;
+	unsigned long passes_per_call;
+	unsigned long long passes_done = 0;
+	unsigned long started;
+	unsigned long elapsed = 0;
+
+	if ((size & 255) != 0)
+		error ("do_copy(): chunk size is not multiple of 256.");
+
+	//-------------------------------------------------
+	// Allocate both buffers with 32 bytes of slack so that the
+	// aligned views still have size bytes behind them.
+	//
+	src_raw = malloc (size+32);
+	if (src_raw == NULL)
+		error ("Out of memory");
+
+	dst_raw = malloc (size+32);
+	if (dst_raw == NULL)
+		error ("Out of memory");
+
+	memset (src_raw, 100, size);
+	memset (dst_raw, 200, size);
+
+	//-------------------------------------------------
+	// Round a misaligned address up to the next multiple of 16.
+	//
+	src = ((unsigned long) src_raw & 15)
+		? (unsigned char*) (((unsigned long) src_raw | 15) + 1) : src_raw;
+	dst = ((unsigned long) dst_raw & 15)
+		? (unsigned char*) (((unsigned long) dst_raw | 15) + 1) : dst_raw;
+
+	//-------------------------------------------------
+	// Describe the test.
+	//
+	print (L"Sequential copy ");
+#if defined(__x86_64__) || defined(__aarch64__)
+	print (mode == SSE2 ? L"(128-bit), size = " : L"(64-bit), size = ");
+#else
+	print (mode == SSE2 ? L"(128-bit), size = " : L"(32-bit), size = ");
+#endif
+	print_size (size);
+	print (L", ");
+
+	flush ();
+
+	//-------------------------------------------------
+	// Copy in batches until the time budget is used up.
+	//
+	passes_per_call = (1 << 26) / size;	// XX need to adjust for CPU MHz
+
+	started = mytime ();
+
+	while (elapsed < usec_per_test) {
+		passes_done += passes_per_call;
+
+#if !defined(__arm__) && !defined(__aarch64__)
+		// Only the SSE2 copy is wired up; any other mode just
+		// spins on the clock without copying.
+		if (mode == SSE2)
+			CopySSE (dst, src, size, passes_per_call);
+#endif
+
+		elapsed = mytime () - started;
+	}
+
+	//-------------------------------------------------
+	// Report the loop count and the bandwidth.
+	//
+	print (L"loops = ");
+	print_int (passes_done);
+	print (L", ");
+
+	int result = calculate_result (size, passes_done, elapsed);
+	newline ();
+
+	flush ();
+
+	free (src_raw);
+	free (dst_raw);
+
+	return result;
+}
+
+
+//----------------------------------------------------------------------------
+// Name:	fb_readwrite
+// Purpose:	Performs sequential read & write tests on framebuffer memory.
+//		Prints a message and returns early when no usable device is found.
+//----------------------------------------------------------------------------
+#if defined(__linux__) && defined(FBIOGET_FSCREENINFO)
+void
+fb_readwrite (bool use_sse2)
+{
+	unsigned long total_count;
+	unsigned long length;
+	unsigned long diff, t0;
+	static struct fb_fix_screeninfo fi;
+	static struct fb_var_screeninfo vi;
+	unsigned long *fb = NULL;
+	int fd;
+#if defined(__x86_64__) || defined(__aarch64__)
+	unsigned long value = 0x1234567689abcdef;
+#else
+	unsigned long value = 0x12345678;
+#endif
+
+	//-------------------------------------------------
+	fd = open ("/dev/fb0", O_RDWR);
+	if (fd < 0)
+		fd = open ("/dev/fb/0", O_RDWR);
+	if (fd < 0) {
+		println (L"Cannot open framebuffer device.");
+		return;
+	}
+
+	if (ioctl (fd, FBIOGET_FSCREENINFO, &fi)) {
+		close (fd);
+		println (L"Cannot get framebuffer info");
+		return;
+	}
+	else
+	if (ioctl (fd, FBIOGET_VSCREENINFO, &vi)) {
+		close (fd);
+		println (L"Cannot get framebuffer info");
+		return;
+	}
+	else
+	{
+		if (fi.visual != FB_VISUAL_TRUECOLOR &&
+		    fi.visual != FB_VISUAL_DIRECTCOLOR ) {
+			close (fd);
+			println (L"Need direct/truecolor framebuffer device.");
+			return;
+		} else {
+			unsigned long fblen;
+
+			print (L"Framebuffer resolution: ");
+			print_int (vi.xres);
+			print (L"x");
+			print_int (vi.yres);
+			print (L", ");
+			print_int (vi.bits_per_pixel);
+			println (L" bpp\n");
+
+			fb = (unsigned long*) fi.smem_start;
+			fblen = fi.smem_len;
+
+			fb = mmap (fb, fblen,
+				PROT_WRITE | PROT_READ,
+				MAP_SHARED, fd, 0);
+			if (fb == MAP_FAILED) {
+				close (fd);
+				println (L"Cannot access framebuffer memory.");
+				return;
+			}
+		}
+	}
+
+	//-------------------
+	// Use only the upper half of the display.
+	// NOTE(review): FB_SIZE is defined elsewhere and is not checked against fi.smem_len.
+	length = FB_SIZE;
+
+	//-------------------
+	// READ
+	//
+	print (L"Framebuffer memory sequential read ");
+	flush ();
+
+	t0 = mytime ();
+
+	total_count = FBLOOPS_R;
+
+#if !defined(__arm__) && !defined(__aarch64__)
+	if (use_sse2)
+		ReaderSSE2 (fb, length, FBLOOPS_R);
+	else
+#endif
+		Reader (fb, length, FBLOOPS_R);
+
+	diff = mytime () - t0;
+
+	calculate_result (length, total_count, diff);
+	newline ();
+
+	//-------------------
+	// WRITE
+	//
+	print (L"Framebuffer memory sequential write ");
+	flush ();
+
+	t0 = mytime ();
+
+	total_count = FBLOOPS_W;
+
+#if !defined(__arm__) && !defined(__aarch64__)
+	if (use_sse2)
+		WriterSSE2_bypass (fb, length, FBLOOPS_W, value);
+	else
+#endif
+		Writer (fb, length, FBLOOPS_W, value);
+
+	diff = mytime () - t0;
+
+	calculate_result (length, total_count, diff);
+	newline ();
+	// Release the framebuffer mapping and its descriptor.
+	munmap (fb, fi.smem_len);
+	close (fd);
+}
+#endif
+
+//----------------------------------------------------------------------------
+// Name:	register_test
+// Purpose:	Determines bandwidth of register-to-register transfers.
+//		Each sub-test calls one transfer routine repeatedly until
+//		usec_per_test has elapsed and then reports the total through
+//		calculate_result(), always passing 256 as the size argument.
+//		The vector sub-tests are compiled for non-ARM targets only;
+//		some of them also depend on use_sse4 or on __x86_64__.
+//----------------------------------------------------------------------------
+void
+register_test ()
+{
+	long long transfers = 0;
+	unsigned long started;
+	unsigned long elapsed = 0;
+
+	// One timed sub-test: reset the counters, call routine(reps) until the
+	// time budget is used up, then print the result line.
+#define RUN_TRANSFER_TEST(routine, reps)			\
+	do {							\
+		transfers = 0;					\
+		elapsed = 0;					\
+		started = mytime ();				\
+		while (elapsed < usec_per_test) {		\
+			routine (reps);				\
+			transfers += (reps);			\
+			elapsed = mytime () - started;		\
+		}						\
+		calculate_result (256, transfers, elapsed);	\
+		newline ();					\
+		flush ();					\
+	} while (0)
+
+	//--------------------------------------
+	// Main register -> main register.
+#if defined(__x86_64__) || defined(__aarch64__)
+	print (L"Main register to main register transfers (64-bit) ");
+#else
+	print (L"Main register to main register transfers (32-bit) ");
+#endif
+	flush ();
+	// REGISTER_COUNT stays defined after this function; stack_test uses it.
+#define REGISTER_COUNT 10000
+	RUN_TRANSFER_TEST (RegisterToRegister, REGISTER_COUNT);
+
+#if !defined(__arm__) && !defined(__aarch64__)
+	//--------------------------------------
+	// Main register -> vector register.
+#ifdef __x86_64__
+	print (L"Main register to vector register transfers (64-bit) ");
+#else
+	print (L"Main register to vector register transfers (32-bit) ");
+#endif
+	flush ();
+#define VREGISTER_COUNT 3333
+	RUN_TRANSFER_TEST (RegisterToVector, VREGISTER_COUNT);
+
+	//--------------------------------------
+	// Vector register -> main register.
+#ifdef __x86_64__
+	print (L"Vector register to main register transfers (64-bit) ");
+#else
+	print (L"Vector register to main register transfers (32-bit) ");
+#endif
+	flush ();
+	RUN_TRANSFER_TEST (VectorToRegister, VREGISTER_COUNT);
+
+	//--------------------------------------
+	// Vector register -> vector register.
+	print (L"Vector register to vector register transfers (128-bit) ");
+	flush ();
+	RUN_TRANSFER_TEST (VectorToVector, VREGISTER_COUNT);
+
+	//--------------------------------------
+	// Vector datum -> main register, by datum width.
+	if (use_sse4) {
+		print (L"Vector 8-bit datum to main register transfers ");
+		flush ();
+		RUN_TRANSFER_TEST (Vector8ToRegister, VREGISTER_COUNT);
+	}
+
+	print (L"Vector 16-bit datum to main register transfers ");
+	flush ();
+	RUN_TRANSFER_TEST (Vector16ToRegister, VREGISTER_COUNT);
+
+	if (use_sse4) {
+		print (L"Vector 32-bit datum to main register transfers ");
+		flush ();
+		RUN_TRANSFER_TEST (Vector32ToRegister, VREGISTER_COUNT);
+	}
+
+#ifdef __x86_64__
+	print (L"Vector 64-bit datum to main register transfers ");
+	flush ();
+	RUN_TRANSFER_TEST (Vector64ToRegister, VREGISTER_COUNT);
+#endif
+
+	//--------------------------------------
+	// Main register datum -> vector register, by datum width.
+	if (use_sse4) {
+		print (L"Main register 8-bit datum to vector register transfers ");
+		flush ();
+		RUN_TRANSFER_TEST (Register8ToVector, VREGISTER_COUNT);
+	}
+
+	print (L"Main register 16-bit datum to vector register transfers ");
+	flush ();
+	RUN_TRANSFER_TEST (Register16ToVector, VREGISTER_COUNT);
+
+	if (use_sse4) {
+		print (L"Main register 32-bit datum to vector register transfers ");
+		flush ();
+		RUN_TRANSFER_TEST (Register32ToVector, VREGISTER_COUNT);
+	}
+
+#ifdef __x86_64__
+	print (L"Main register 64-bit datum to vector register transfers ");
+	flush ();
+	RUN_TRANSFER_TEST (Register64ToVector, VREGISTER_COUNT);
+#endif
+#endif
+
+#undef RUN_TRANSFER_TEST
+}
+
+//----------------------------------------------------------------------------
+// Name:	stack_test
+// Purpose:	Determines bandwidth of stack-to/from-register transfers.
+//		Runs StackReader and then StackWriter, each repeatedly until
+//		usec_per_test has elapsed, and prints one result per routine.
+//----------------------------------------------------------------------------
+void
+stack_test ()
+{
+	long long moved;
+	unsigned long began;
+	unsigned long spent;
+
+	//--------------------------------------
+	// Stack -> register.
+#if defined(__x86_64__) || defined(__aarch64__)
+	print (L"Stack-to-register transfers (64-bit) ");
+#else
+	print (L"Stack-to-register transfers (32-bit) ");
+#endif
+	flush ();
+
+	moved = 0;
+	began = mytime ();
+	for (spent = 0; spent < usec_per_test; spent = mytime () - began) {
+		StackReader (REGISTER_COUNT);
+		moved += REGISTER_COUNT;
+	}
+
+	calculate_result (256, moved, spent);
+	newline ();
+	flush ();
+
+	//--------------------------------------
+	// Register -> stack.
+#if defined(__x86_64__) || defined(__aarch64__)
+	print (L"Register-to-stack transfers (64-bit) ");
+#else
+	print (L"Register-to-stack transfers (32-bit) ");
+#endif
+	flush ();
+
+	moved = 0;
+	began = mytime ();
+	for (spent = 0; spent < usec_per_test; spent = mytime () - began) {
+		StackWriter (REGISTER_COUNT);
+		moved += REGISTER_COUNT;
+	}
+
+	calculate_result (256, moved, spent);
+	newline ();
+	flush ();
+}
+
+//----------------------------------------------------------------------------
+// Name:	library_test
+// Purpose:	Performs C library tests (memset, memcpy).
+//		Allocates two NT_SIZE-byte buffers, runs NT_SIZE2 passes of
+//		memset over the first and NT_SIZE2 passes of memcpy from the
+//		first into the second, and prints one result for each.
+//----------------------------------------------------------------------------
+void
+library_test ()
+{
+	char *fill_buf, *copy_buf;
+	unsigned long started, elapsed;
+	int pass;
+
+	// Buffer size and pass count depend on the target platform.
+#if defined(__WIN32__) && (defined(__arm__) || defined(__aarch64__))
+	#define NT_SIZE (1024*1024)
+	#define NT_SIZE2 (50)
+#elif !defined(__WIN32__) && (defined(__arm__) || defined(__aarch64__))
+#if defined(DRAM_SIZE_SMALL)
+	#define NT_SIZE (16*1024*1024)
+#else
+	#define NT_SIZE (32*1024*1024)
+#endif
+	#define NT_SIZE2 (50)
+#else
+	#define NT_SIZE (64*1024*1024)
+	#define NT_SIZE2 (100)
+#endif
+
+	fill_buf = malloc (NT_SIZE);
+	if (fill_buf == NULL)
+		error ("Out of memory");
+
+	copy_buf = malloc (NT_SIZE);
+	if (copy_buf == NULL)
+		error ("Out of memory");
+
+	//--------------------------------------
+	// memset: the fill byte changes on every pass.
+	started = mytime ();
+	pass = 0;
+	while (pass < NT_SIZE2) {
+		memset (fill_buf, pass, NT_SIZE);
+		pass++;
+	}
+	elapsed = mytime () - started;
+
+	print (L"Library: memset ");
+	calculate_result (NT_SIZE, NT_SIZE2, elapsed);
+	newline ();
+
+	flush ();
+
+	//--------------------------------------
+	// memcpy: copy the filled buffer into the second one.
+	started = mytime ();
+	pass = 0;
+	while (pass < NT_SIZE2) {
+		memcpy (copy_buf, fill_buf, NT_SIZE);
+		pass++;
+	}
+	elapsed = mytime () - started;
+
+	print (L"Library: memcpy ");
+	calculate_result (NT_SIZE, NT_SIZE2, elapsed);
+	newline ();
+
+	flush ();
+
+	free (fill_buf);
+	free (copy_buf);
+}
+
+//----------------------------------------------------------------------------
+// Name:	network_test_core
+// Purpose:	Performs the network test, talking to and receiving data
+//		back from a transponder node.
+// Params:	net_path   - "host" or "host:port"; without a port,
+//		             NETWORK_DEFAULT_PORTNUM is used.
+//		chunk      - buffer that is sent `count` times.
+//		chunk_size - number of bytes in chunk.
+//		count      - how many times chunk is sent.
+// Note:	Port number specified using server:# notation.
+// Returns:	-1 on error, else the network duration in microseconds.
+//		The duration covers all sends plus the first reply read.
+//----------------------------------------------------------------------------
+long
+network_test_core (const char *net_path, char *chunk,
+			unsigned long chunk_size,
+			unsigned long count)
+{
+	char hostname [PATH_MAX];
+	char *s;
+	char *buffer = NULL;
+	int port = NETWORK_DEFAULT_PORTNUM ;
+	int sock;
+	long result = -1;
+	unsigned long i;
+	unsigned long t0;
+	struct hostent* host;
+	struct sockaddr_in addr;
+
+	// Refuse names that do not fit instead of overrunning hostname[].
+	if (!net_path || !chunk || strlen (net_path) >= sizeof (hostname))
+		return -1;
+	strcpy (hostname, net_path);
+	if ((s = strchr (hostname, ':'))) {
+		*s++ = 0;
+		port = atoi (s);
+	}
+
+	host = gethostbyname (hostname);
+	if (!host || host->h_addrtype != AF_INET || !host->h_addr_list[0])
+		return -1;
+
+	// Copy the resolved IPv4 address directly; no text round trip needed.
+	memset (&addr, 0, sizeof (addr));
+	addr.sin_family = AF_INET;
+	memcpy (&addr.sin_addr, host->h_addr_list[0], sizeof (addr.sin_addr));
+	addr.sin_port = htons(port);
+
+	sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+	if (sock < 0)
+		return -1;
+
+	if (connect (sock, (struct sockaddr*) &addr, sizeof (addr)))
+		goto done;
+
+	//------------------------------------
+	// Send all of our data.
+	// send() may accept fewer bytes than requested, so keep going until
+	// the whole chunk is out; any failure aborts the test.
+	//
+	t0 = mytime ();
+	for (i = 0; i < count; i++) {
+		unsigned long sent = 0;
+		while (sent < chunk_size) {
+			long n = send (sock, chunk + sent, chunk_size - sent, 0);
+			if (n <= 0)
+				goto done;
+			sent += (unsigned long) n;
+		}
+	}
+
+	//------------------------------------
+	// Read the response.
+	//
+	buffer = malloc (chunk_size);
+	if (!buffer)
+		goto done;
+	if (recv (sock, buffer, chunk_size, 0) <= 0)
+		goto done;
+
+	result = mytime () - t0;
+
+done:
+	// Single exit: the reply buffer used to leak when recv() failed.
+	free (buffer);
+	close (sock);
+	return result;
+}
+
+//----------------------------------------------------------------------------
+// Name:	ip_to_str
+// Purpose:	Formats the low 32 bits of addr as dotted decimal, least
+//		significant byte first, into str. Does nothing if str is NULL.
+//----------------------------------------------------------------------------
+void
+ip_to_str (unsigned long addr, char *str)
+{
+	unsigned int octet [4];
+	int k;
+
+	if (str == NULL)
+		return;
+
+	// octet[k] is byte k of addr, counting from the low end.
+	for (k = 0; k < 4; k++)
+		octet [k] = (unsigned int) ((addr >> (8 * k)) & 0xff);
+
+	sprintf (str, "%u.%u.%u.%u", octet [0], octet [1], octet [2], octet [3]);
+}
+
+//----------------------------------------------------------------------------
+// Name:	network_transponder
+// Purpose:	Act as a transponder, receiving chunks of data and sending
+//		back an acknowledgement once the entire chunk is read.
+//		Listens on NETWORK_DEFAULT_PORTNUM and serves one client at
+//		a time. The first message of a connection starts with the
+//		number of chunks the client is going to send.
+// Returns:	False if a problem occurs setting up the network socket, or
+//		if a client's first message cannot be read or parsed.
+//		True when a client sends the shutdown count (-99).
+//----------------------------------------------------------------------------
+bool
+network_transponder ()
+{
+	struct sockaddr_in sin, from;
+
+	//------------------------------
+	// Get listening socket for port.
+	// Then listen on given port#.
+	//
+	memset (&sin, 0, sizeof (sin));
+	sin.sin_family = AF_INET;
+	sin.sin_addr.s_addr = htonl(INADDR_ANY);
+	sin.sin_port = htons(NETWORK_DEFAULT_PORTNUM);
+	int listensock;
+	if ((listensock = socket (AF_INET, SOCK_STREAM, 0)) < 0)  {
+		perror ("socket");
+		return false;
+	}
+	if (bind (listensock, (struct sockaddr*) &sin, sizeof(sin)) < 0) {
+		perror ("bind");
+		close (listensock);
+		return false;
+	}
+	if (listen (listensock, 500) < 0) {
+		perror ("listen");
+		close (listensock);
+		return false;
+	}
+
+	// The loop is left only through one of the return statements below.
+	for (;;) {
+		//----------------------------------------
+		// Wait for a client to contact us.
+		//
+		socklen_t len = sizeof (struct sockaddr);
+		int sock = accept (listensock, (struct sockaddr*) &from, &len);
+		if (sock < 0) {
+			perror ("accept");
+			close (listensock);
+			return false;
+		}
+
+		if (len != sizeof (struct sockaddr_in)) {
+			close (sock);
+			close (listensock);
+			return false;
+		}
+
+		//----------------------------------------
+		// Read the first chunk. A failed or empty read must be
+		// rejected before amount_read is used as an index: -1 would
+		// write in front of the buffer.
+		//
+		char chunk [NETWORK_CHUNK_SIZE+1];
+		long n_chunks = 0;
+		int amount_read = read (sock, chunk, NETWORK_CHUNK_SIZE);
+		if (amount_read <= 0) {
+			if (amount_read < 0)
+				perror ("read");
+			close (sock);
+			close (listensock);
+			return false;
+		}
+		chunk [amount_read] = 0;
+		if (1 != sscanf (chunk, "%ld", &n_chunks)) {
+			close (sock);
+			close (listensock);
+			return false;
+		}
+
+		//----------------------------------------
+		// If the leader sends us a chunk count of
+		// -99, this indicates that we should exit.
+		//
+		if (n_chunks == -99) {
+			close (sock);
+			close (listensock);
+			return true;
+		}
+
+		//----------------------------------------
+		// Drain the rest of the transfer. "remaining" is unsigned,
+		// so it is clamped at zero rather than allowed to wrap when
+		// more bytes arrive than were announced.
+		//
+		unsigned long long remaining = n_chunks;
+		remaining *= NETWORK_CHUNK_SIZE;
+
+		if ((unsigned long long) amount_read >= remaining)
+			remaining = 0;
+		else
+			remaining -= amount_read;
+
+		while (remaining > 0) {
+			amount_read = read (sock, chunk, NETWORK_CHUNK_SIZE);
+			if (amount_read < 0) {
+				perror ("read");
+				break;
+			}
+			if (!amount_read)
+				break;
+
+			if ((unsigned long long) amount_read >= remaining)
+				remaining = 0;
+			else
+				remaining -= amount_read;
+		}
+
+		// Acknowledge with the first four bytes ("OK.\n").
+		char *foo = "OK.\n\n";
+		if (write (sock, foo, 4) != 4)
+			perror ("write");
+		close (sock);
+	}
+}
+
+//----------------------------------------------------------------------------
+// Name:	network_test
+// Purpose:	Acts as the leader: for every destination, sends 1, 2, 4 ...
+//		4096 chunks of NETWORK_CHUNK_SIZE bytes through
+//		network_test_core() and prints the rate achieved for each.
+// Params:	destinations   - array of "host" or "host:port" strings.
+//		n_destinations - number of entries in destinations.
+// Returns:	Always true. A destination that cannot be reached is
+//		reported on stderr and its remaining sizes are skipped.
+//----------------------------------------------------------------------------
+bool
+network_test (char **destinations, int n_destinations)
+{
+	int i;
+
+	//----------------------------------------
+	// The memory chunk starts with a 12-byte
+	// length of the overall send size.
+	// The memory chunk will have a list of
+	// the destinations in it.
+	// In future, there will be a mechanism
+	// for testing bandwidth between all nodes,
+	// not just the leader & each of the
+	// transponders.
+	//
+	char chunk [NETWORK_CHUNK_SIZE];
+	memset (chunk, 0, NETWORK_CHUNK_SIZE);
+	sprintf (chunk, "000000000000\n%d\n", n_destinations);
+	for (i = 0; i < n_destinations; i++) {
+		char *s = destinations [i];
+		size_t used = strlen (chunk);
+		// Bytes the entry needs: the name, " transp\n" and the NUL
+		// (sizeof of the literal already counts the NUL).
+		size_t needed = strlen (s) + sizeof (" transp\n");
+		if (needed <= sizeof (chunk) - used) {
+			//----------------------------------------
+			// "transp" indicates that the given node
+			// has not yet been a leader.
+			// In future, "done" will indicate it has.
+			//
+			sprintf (chunk + used, "%s %s\n", s, "transp");
+		}
+	}
+
+	//----------------------------------------
+	// For each destination, run the test.
+	//
+	for (i = 0; i < n_destinations; i++) {
+		int j = 0;
+		bool problem = false;
+
+		char *hostname = destinations[i];
+		printf ("Bandwidth sending to %s:\n", hostname);
+
+		//----------------------------------------
+		// Send from 8kB up to 32 MB of data.
+		//
+		while (!problem && j < 13) {
+			unsigned long chunk_count = 1UL << j;
+			unsigned long long amt_to_send = chunk_count;
+			amt_to_send *= NETWORK_CHUNK_SIZE;
+
+			if (!amt_to_send) // unlikely
+				break;
+
+			//----------------------------------------
+			// Write the overall send size into the
+			// 1st line of the chunk so that the
+			// transponder knows how large the send
+			// is without guessing.
+			// sprintf puts its NUL at chunk[11]; it is
+			// turned back into a space right away.
+			//
+			sprintf (chunk, "%11lu", chunk_count);
+			chunk[11] = ' ';
+
+			//--------------------
+			// Send the data.
+			//
+			long duration = network_test_core (hostname,
+				chunk, NETWORK_CHUNK_SIZE, chunk_count);
+			if (duration == -1) {
+				problem = true;
+				fprintf (stderr, "\nCan't connect to %s\n", hostname);
+			} else {
+				unsigned long amt_in_kb = amt_to_send / 1024;
+				unsigned long amt_in_mb = amt_to_send / 1048576;
+				if (!amt_in_mb) {
+					printf ("\tSent %lu kB...", amt_in_kb);
+				} else {
+					printf ("\tSent %lu MB...", amt_in_mb);
+				}
+
+				//------------------------------
+				// Calculate rate in MB/sec.
+				//
+				// A transfer that finishes within one
+				// timer tick reports 0; count it as one
+				// microsecond to avoid dividing by zero.
+				unsigned long long usec =
+					duration > 0 ? (unsigned long long) duration : 1;
+
+				// Get total # bytes.
+				unsigned long long tmp = NETWORK_CHUNK_SIZE;
+				tmp *= chunk_count;
+
+				// Get total bytes per second.
+				tmp *= 1000000;
+				tmp /= usec;
+
+				// Bytes to hundredths of a megabyte.
+				tmp /= 1000;
+				tmp /= 10;
+				unsigned long whole = tmp / 100;
+				unsigned long frac = tmp % 100;
+				printf ("%lu.%02lu MB/second\n", whole, frac);
+			}
+			j++;
+		}
+
+		puts ("");
+	}
+
+	return true;
+}
+
+//----------------------------------------------------------------------------
+// Name:	usage
+// Purpose:	Prints the three command-line synopses to stdout and ends
+//		the program with exit status 0.
+//----------------------------------------------------------------------------
+void
+usage ()
+{
+	static const char *const synopsis [] = {
+		"Usage for memory tests: bandwidth [--quick] [--thread N] [--chunk-size N]\n",
+		"Usage for starting network tests: bandwidth --network <ipaddr1> [<ipaddr2...]\n",
+		"Usage for receiving network tests: bandwidth --transponder\n",
+	};
+	size_t k;
+
+	for (k = 0; k < sizeof (synopsis) / sizeof (synopsis [0]); k++)
+		fputs (synopsis [k], stdout);
+
+	exit (0);
+}
+
+//----------------------------------------------------------------------------
+// Name:	main
+// Purpose:	Parses the command line. In network mode it runs either the
+//		leader (network_test) or the transponder and exits. Otherwise
+//		it runs the memory sweeps, the register, stack, library and
+//		framebuffer tests, and writes the graph to bandwidth.bmp.
+// Note:	error() is assumed not to return - TODO confirm.
+//----------------------------------------------------------------------------
+int
+main (int argc, char **argv)
+{
+	int i, j, chunk_size;
+	const int n_chunk_sizes =
+		(int) (sizeof (chunk_sizes) / sizeof (chunk_sizes [0]));
+
+	--argc;
+	++argv;
+
+	strcpy (graph_title, TITLE);
+
+	bool network_mode = false;
+	bool network_leader = false; // false => transponder
+	int network_destinations_size = 0;
+	int n_network_destinations = 0;
+	char **network_destinations = NULL;
+
+	i = 0;
+	while (i < argc) {
+		char *s = argv [i++];
+		if (!strcmp ("--network", s)) {
+			network_mode = true;
+			network_leader = true;
+			// Allocate the list once: a repeated --network must not
+			// throw away destinations that were already collected.
+			if (!network_destinations) {
+				network_destinations_size = 20;
+				network_destinations = (char**) malloc (network_destinations_size * sizeof (char*));
+				if (!network_destinations)
+					error ("Out of memory");
+			}
+		}
+		else
+		if (!strcmp ("--transponder", s)) {
+			network_mode = true;
+		}
+		else
+		if (!strcmp ("--slow", s)) {
+			usec_per_test=20000000;	// 20 seconds per test.
+		}
+		else
+		if (!strcmp ("--quick", s)) {
+			usec_per_test = 250000;	// 0.25 seconds per test.
+		}
+		else
+		if (!strcmp ("--nosse2", s)) {
+			use_sse2 = false;
+			use_sse4 = false;
+		}
+		else
+		if (!strcmp ("--nosse4", s)) {
+			use_sse4 = false;
+		}
+		else
+		if (!strcmp ("--help", s)) {
+			usage ();
+		}
+		else
+		if (!strcmp ("--title", s) && i != argc) {
+			// NOTE(review): unbounded write; the size of graph_title
+			// is not visible here, so it cannot be bounded locally.
+			sprintf (graph_title, "%s -- %s", TITLE, argv[i++]);
+		}
+		else
+		if (!strcmp ("--thread", s)) {
+			int n = 0;
+			// The option needs a value; usage() does not return.
+			if (i >= argc)
+				usage ();
+			thread_num = atoi(argv[i++]);
+			// Count the set bits: more than one means not a power of 2.
+			for (j = 0; j < 32; j++)
+				n += (thread_num >> j) & 0x1;
+			if (n > 1)
+				error("thread_num must be power of 2\n");
+		}
+		else
+		if (!strcmp ("--chunk-size", s)) {
+			if (i >= argc)
+				usage ();
+			chunk_size = strtoul(argv[i++], NULL, 0);
+			// Find the first table entry that is large enough. The
+			// table ends with a 0 entry (the sweeps below stop on
+			// it), so the search stops there as well.
+			for (j = 0; j < n_chunk_sizes && chunk_sizes[j]; j++) {
+				if (chunk_size <= chunk_sizes[j])
+					break;
+			}
+			// Without this check chunk_index could point past the
+			// end of the table.
+			if (j >= n_chunk_sizes || !chunk_sizes[j])
+				error ("chunk size is larger than every supported size\n");
+			else
+				chunk_index = j;
+		}
+		else {
+			// Anything else is a destination, valid for a leader only.
+			if (!network_mode || !network_leader)
+				usage ();
+
+			if ('-' == *s)
+				usage ();
+
+			if (n_network_destinations >= network_destinations_size) {
+				network_destinations_size *= 2;
+				size_t newsize = sizeof(char*) * network_destinations_size;
+				char **grown = realloc (network_destinations,
+					newsize);
+				if (!grown)
+					error ("Out of memory");
+				network_destinations = grown;
+			}
+
+			char *copy = strdup (s);
+			if (!copy)
+				error ("Out of memory");
+			network_destinations [n_network_destinations++] = copy;
+		}
+	}
+
+	msg[0] = 0;
+
+#if !(defined(__WIN32__) && (defined(__arm__) || defined(__aarch64__)))
+	printf ("This is bandwidth version %s.\n", VERSION);
+	printf ("Copyright (C) 2005-2010 by Zack T Smith.\n\n");
+	printf ("This software is covered by the GNU Public License.\n");
+	printf ("It is provided AS-IS, use at your own risk.\n");
+	printf ("See the file COPYING for more information.\n\n");
+	fflush (stdout);
+#else
+	println (L"(C) 2010 by Zack Smith");
+	println (L"Under GNU Public License");
+	println (L"Use at your own risk.");
+#endif
+
+	//----------------------------------------
+	// If network mode selected, enter it now.
+	// Currently cannot combine memory tests
+	// & network tests.
+	//
+	if (network_mode) {
+		if (network_leader) {
+			network_test (network_destinations, n_network_destinations);
+		} else {
+			network_transponder ();
+		}
+
+		puts ("Done.");
+		return 0;
+	}
+
+#if !defined(__arm__) && !defined(__aarch64__)
+	if (!has_sse2 ()) {
+		puts ("Processor does not have SSE2.");
+		use_sse2 = false;
+		use_sse4 = false;
+	}
+
+#ifdef __x86_64__
+	if (use_sse2)
+		println (L"Using 128-bit and 64-bit data transfers.");
+	else
+		println (L"Using 64-bit data transfers.");
+#else
+	if (use_sse2)
+		println (L"Using 128-bit and 32-bit data transfers.");
+	else
+		println (L"Using 32-bit data transfers.");
+#endif
+
+#else
+
+#if defined(__aarch64__)
+	println (L"Using 64-bit transfers.");
+#else
+	println (L"Using 32-bit transfers.");
+#endif
+
+	use_sse2 = false;
+#endif
+
+	println (L"Notation: kB = 1024 B, MB = 1048576 B.");
+
+	flush ();
+
+	//------------------------------------------------------------
+	// Attempt to obtain information about the CPU.
+	// NOTE(review): the results go through a fixed, predictable
+	// file name in /tmp; reading /proc/cpuinfo directly would
+	// avoid both the shell and the temp file.
+	//
+#ifdef __linux__
+	struct stat st;
+	if (!stat ("/proc/cpuinfo", &st)) {
+#define TMPFILE "/tmp/bandw_tmp"
+		unlink (TMPFILE);
+		if (-1 == system ("grep MHz /proc/cpuinfo | uniq | sed \"s/[\\t\\n: a-zA-Z]//g\" > "TMPFILE))
+			perror ("system");
+
+		FILE *f = fopen (TMPFILE, "r");
+		if (f) {
+			float cpu_speed = 0.0;
+
+			if (1 == fscanf (f, "%g", &cpu_speed)) {
+				puts ("");
+				printf ("CPU speed is %g MHz.\n", cpu_speed);
+			}
+			fclose (f);
+		}
+
+#if !defined(__arm__) && !defined(__aarch64__)
+		unlink (TMPFILE);
+		if (-1 == system ("grep -i sse4 /proc/cpuinfo > "TMPFILE))
+			perror ("system");
+
+		// An (almost) empty output file means grep found nothing.
+		if (!stat (TMPFILE, &st)) {
+			if (st.st_size < 2) {
+				use_sse4 = false;
+				puts ("Processor lacks SSE4.");
+			}
+		}
+
+		// NOTE(review): this runs only when SSE2 is already off, so it
+		// can only print the message; the condition may be inverted.
+		if (!use_sse2) {
+			unlink (TMPFILE);
+			if (-1 == system ("grep -i sse2 /proc/cpuinfo > "TMPFILE))
+				perror ("system");
+
+			if (!stat (TMPFILE, &st)) {
+				if (st.st_size < 2) {
+					use_sse2 = false;
+					puts ("Processor lacks SSE2.");
+				}
+			}
+		}
+#endif
+		// Do not leave the scratch file behind.
+		unlink (TMPFILE);
+	} else {
+		printf ("CPU information is not available (/proc/cpuinfo).\n");
+	}
+
+	cpu_num = sysconf(_SC_NPROCESSORS_CONF);
+	printf("System has %d processor(s)\n", cpu_num);
+
+	fflush (stdout);
+#endif
+
+	graph = BMP_new (graph_width, graph_height);
+	graph_init ();
+
+	// Every sweep below walks chunk_sizes[] from chunk_index up to the
+	// terminating 0 entry and adds one graph point per size.
+
+#if !defined(__arm__) && !defined(__aarch64__)
+	//------------------------------------------------------------
+	// SSE2 sequential reads.
+	//
+	if (use_sse2) {
+		graph_new_line ("Sequential 128-bit reads", RGB_RED);
+
+		newline ();
+
+		i = chunk_index;
+		while ((chunk_size = chunk_sizes [i++])) {
+			int amount = do_read (chunk_size, true, false);
+
+			graph_add_point (chunk_size, amount);
+		}
+	}
+
+	//------------------------------------------------------------
+	// SSE2 random reads.
+	//
+	if (use_sse2) {
+		graph_new_line ("Random 128-bit reads", RGB_MAROON);
+
+		newline ();
+		srand (time (NULL));
+
+		i = chunk_index;
+		while ((chunk_size = chunk_sizes [i++])) {
+			int amount = do_read (chunk_size, true, true);
+
+			graph_add_point (chunk_size, amount);
+		}
+	}
+
+	//------------------------------------------------------------
+	// SSE2 sequential writes that do not bypass the caches.
+	//
+	if (use_sse2) {
+		graph_new_line ("Sequential 128-bit cache writes", RGB_PURPLE);
+
+		newline ();
+
+		i = chunk_index;
+		while ((chunk_size = chunk_sizes [i++])) {
+			int amount = do_write (chunk_size, SSE2, false);
+
+			graph_add_point (chunk_size, amount);
+		}
+	}
+
+	//------------------------------------------------------------
+	// SSE2 random writes that do not bypass the caches.
+	//
+	if (use_sse2) {
+		graph_new_line ("Random 128-bit cache writes", RGB_NAVYBLUE);
+
+		newline ();
+		srand (time (NULL));
+
+		i = chunk_index;
+		while ((chunk_size = chunk_sizes [i++])) {
+			int amount = do_write (chunk_size, SSE2, true);
+
+			graph_add_point (chunk_size, amount);
+		}
+	}
+
+	//------------------------------------------------------------
+	// SSE2 sequential writes that do bypass the caches.
+	//
+	if (use_sse2) {
+		graph_new_line ("Sequential 128-bit bypassing writes", RGB_DARKORANGE);
+
+		newline ();
+
+		i = chunk_index;
+		while ((chunk_size = chunk_sizes [i++])) {
+			int amount = do_write (chunk_size, SSE2_BYPASS, false);
+
+			graph_add_point (chunk_size, amount);
+		}
+	}
+
+	//------------------------------------------------------------
+	// SSE2 random writes that bypass the caches.
+	//
+	if (use_sse2) {
+		graph_new_line ("Random 128-bit bypassing writes", RGB_LEMONYELLOW);
+
+		newline ();
+		srand (time (NULL));
+
+		i = chunk_index;
+		while ((chunk_size = chunk_sizes [i++])) {
+			int amount = do_write (chunk_size, SSE2_BYPASS, true);
+
+			graph_add_point (chunk_size, amount);
+		}
+	}
+#endif
+
+	//------------------------------------------------------------
+	// Sequential non-SSE2 reads.
+	//
+	newline ();
+#if defined(__x86_64__) || defined(__aarch64__)
+	graph_new_line ("Sequential 64-bit reads", RGB_BLUE);
+#else
+	graph_new_line ("Sequential 32-bit reads", RGB_BLUE);
+#endif
+
+	i = chunk_index;
+	while ((chunk_size = chunk_sizes [i++])) {
+		int amount = do_read (chunk_size, false, false);
+
+		graph_add_point (chunk_size, amount);
+	}
+
+	//------------------------------------------------------------
+	// Random non-SSE2 reads.
+	//
+	newline ();
+#if defined(__x86_64__) || defined(__aarch64__)
+	graph_new_line ("Random 64-bit reads", RGB_CYAN);
+#else
+	graph_new_line ("Random 32-bit reads", RGB_CYAN);
+#endif
+	srand (time (NULL));
+
+	i = chunk_index;
+	while ((chunk_size = chunk_sizes [i++])) {
+		int amount = do_read (chunk_size, false, true);
+
+		graph_add_point (chunk_size, amount);
+	}
+
+	//------------------------------------------------------------
+	// Sequential non-SSE2 writes.
+	//
+#if defined(__x86_64__) || defined(__aarch64__)
+	graph_new_line ("Sequential 64-bit writes", RGB_DARKGREEN);
+#else
+	graph_new_line ("Sequential 32-bit writes", RGB_DARKGREEN);
+#endif
+
+	newline ();
+
+	i = chunk_index;
+	while ((chunk_size = chunk_sizes [i++])) {
+		int amount = do_write (chunk_size, NO_SSE2, false);
+
+		graph_add_point (chunk_size, amount);
+	}
+
+	//------------------------------------------------------------
+	// Random non-SSE2 writes.
+	//
+#if defined(__x86_64__) || defined(__aarch64__)
+	graph_new_line ("Random 64-bit writes", RGB_GREEN);
+#else
+	graph_new_line ("Random 32-bit writes", RGB_GREEN);
+#endif
+
+	newline ();
+	srand (time (NULL));
+
+	i = chunk_index;
+	while ((chunk_size = chunk_sizes [i++])) {
+		int amount = do_write (chunk_size, NO_SSE2, true);
+
+		graph_add_point (chunk_size, amount);
+	}
+
+#if !defined(__arm__) && !defined(__aarch64__)
+	//------------------------------------------------------------
+	// SSE2 sequential copy.
+	// Starts at chunk_index like every other sweep, so that
+	// --chunk-size applies to the copy test as well.
+	//
+	if (use_sse2) {
+		graph_new_line ("Sequential 128-bit copy", 0x8f8844);
+
+		newline ();
+
+		i = chunk_index;
+		while ((chunk_size = chunk_sizes [i++])) {
+			int amount = do_copy (chunk_size, SSE2);
+
+			graph_add_point (chunk_size, amount);
+		}
+	}
+#endif
+
+	//------------------------------------------------------------
+	// Register to register.
+	//
+	newline ();
+	register_test ();
+
+	//------------------------------------------------------------
+	// Stack to/from register.
+	//
+	newline ();
+	stack_test ();
+
+	//------------------------------------------------------------
+	// C library performance.
+	//
+	newline ();
+	library_test ();
+
+	//------------------------------------------------------------
+	// Framebuffer read & write.
+	//
+#if defined(__linux__) && defined(FBIOGET_FSCREENINFO)
+	newline ();
+	fb_readwrite (true);
+#endif
+
+#if defined(__WIN32__) && (defined(__arm__) || defined(__aarch64__))
+	MessageBoxW (0, msg, APPNAME, 0);
+
+	FILE *of = fopen ("bandwidth.log", "w");
+	if (of) {
+		dump (of);
+		fclose (of);
+	}
+#else
+	flush ();
+#endif
+
+	graph_make ();
+
+	BMP_write (graph, "bandwidth.bmp");
+	BMP_delete (graph);
+#if defined(__linux__) || defined(__CYGWIN__) || defined(__APPLE__)
+	puts ("\nWrote graph to bandwidth.bmp.");
+	puts ("");
+	puts ("Done.");
+#endif
+
+	return 0;
+}
diff --git a/make.inc b/make.inc
new file mode 100644
index 0000000..e8eccab
--- /dev/null
+++ b/make.inc
@@ -0,0 +1,65 @@
+##
+## unit_test/linux/benchmark/bandwidth/make.inc
+##
+## History:
+##    2012/05/31 - [Cao Rongrong] Created file
+##
+## Copyright (C) 2011-2015, Ambarella, Inc.
+##
+## All rights reserved. No Part of this file may be reproduced, stored
+## in a retrieval system, or transmitted, in any form, or by any means,
+## electronic, mechanical, photocopying, recording, or otherwise,
+## without the prior consent of Ambarella, Inc.
+##
+ifeq ($(BUILD_AMBARELLA_UNIT_TESTS_BANDWIDTH), y)
+
+LOCAL_PATH	:= $(call my-dir)
+
+### bandwidth-arm: main.c + BMP.c + the ARM or ARM64 assembly routines.
+include $(CLEAR_VARS)
+
+LOCAL_TARGET	:= bandwidth-arm
+ifeq ($(CPU_ARCH), arm64)
+LOCAL_SRCS	:= $(LOCAL_PATH)/routinesARM64.S
+else
+LOCAL_SRCS	:= $(LOCAL_PATH)/routinesARM.S
+endif
+LOCAL_SRCS	+= $(LOCAL_PATH)/main.c $(LOCAL_PATH)/BMP.c
+LOCAL_CFLAGS	:= -O3
+
+include $(BUILD_APP)
+
+.PHONY: $(LOCAL_TARGET)
+# Copy the built module into $(UNIT_TEST_PATH).
+$(LOCAL_TARGET): $(LOCAL_MODULE)
+	@mkdir -p $(UNIT_TEST_PATH)/
+	@cp -dpRf $< $(UNIT_TEST_PATH)/
+	@echo "Build $@ Done."
+
+$(call add-target-into-build, $(LOCAL_TARGET))
+
+### bandwidth-arm-thread: same, but from main_thread.c and linked with -lpthread.
+include $(CLEAR_VARS)
+
+LOCAL_TARGET	:= bandwidth-arm-thread
+ifeq ($(CPU_ARCH), arm64)
+LOCAL_SRCS	:= $(LOCAL_PATH)/routinesARM64.S
+else
+LOCAL_SRCS	:= $(LOCAL_PATH)/routinesARM.S
+endif
+LOCAL_SRCS	+= $(LOCAL_PATH)/main_thread.c $(LOCAL_PATH)/BMP.c
+LOCAL_CFLAGS	:= -O3
+LOCAL_LDFLAGS	:= -lpthread
+
+include $(BUILD_APP)
+
+.PHONY: $(LOCAL_TARGET)
+# Copy the built module into $(UNIT_TEST_PATH).
+$(LOCAL_TARGET): $(LOCAL_MODULE)
+	@mkdir -p $(UNIT_TEST_PATH)/
+	@cp -dpRf $< $(UNIT_TEST_PATH)/
+	@echo "Build $@ Done."
+
+$(call add-target-into-build, $(LOCAL_TARGET))
+
+endif
diff --git a/routines32.asm b/routines32.asm
new file mode 100644
index 0000000..2f6f485
--- /dev/null
+++ b/routines32.asm
@@ -0,0 +1,1636 @@
+
+; ============================================================================
+;  bandwidth 0.23, a benchmark to estimate memory transfer bandwidth.
+;  Copyright (C) 2005-2010 by Zack T Smith.
+;
+;  This program is free software; you can redistribute it and/or modify
+;  it under the terms of the GNU General Public License as published by
+;  the Free Software Foundation; either version 2 of the License, or
+;  (at your option) any later version.
+;
+;  This program is distributed in the hope that it will be useful,
+;  but WITHOUT ANY WARRANTY; without even the implied warranty of
+;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;  GNU General Public License for more details.
+;
+;  You should have received a copy of the GNU General Public License
+;  along with this program; if not, write to the Free Software
+;  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+;
+;  The author may be reached at fbui@comcast.net.
+; =============================================================================
+
+bits	32
+cpu	prescott
+
+; Cygwin requires the underbar-prefixed symbols.
+global	_WriterSSE2
+global	WriterSSE2
+
+global	_ReaderSSE2
+global	ReaderSSE2
+
+global	_RandomReaderSSE2
+global	RandomReaderSSE2
+
+global	_WriterSSE2_bypass
+global	WriterSSE2_bypass
+
+global	_RandomWriterSSE2_bypass
+global	RandomWriterSSE2_bypass
+
+global	Reader
+global	_Reader
+
+global	Writer
+global	_Writer
+
+global	RandomReader
+global	_RandomReader
+
+global	RandomWriter
+global	_RandomWriter
+
+global	RandomWriterSSE2
+global	_RandomWriterSSE2
+
+global	has_sse2
+global	_has_sse2
+
+global	CopySSE
+global	_CopySSE
+
+global	RegisterToRegister
+global	_RegisterToRegister
+
+global	VectorToVector
+global	_VectorToVector
+
+global	RegisterToVector
+global	_RegisterToVector
+
+global	VectorToRegister
+global	_VectorToRegister
+
+global	Register8ToVector
+global	Register16ToVector
+global	Register32ToVector
+global	Register64ToVector
+global	Vector8ToRegister
+global	Vector16ToRegister
+global	Vector32ToRegister
+global	Vector64ToRegister
+
+global	_Register8ToVector
+global	_Register16ToVector
+global	_Register32ToVector
+global	_Register64ToVector
+global	_Vector8ToRegister
+global	_Vector16ToRegister
+global	_Vector32ToRegister
+global	_Vector64ToRegister
+
+global	StackReader
+global	_StackReader
+
+global	StackWriter
+global	_StackWriter
+
+	section .text
+
+;------------------------------------------------------------------------------
+; Name:		Reader
+; Purpose:	Reads 32-bit values sequentially from an area of memory.
+; Params:	
+;		[esp+4]	= ptr to memory area
+; 		[esp+8] = length in bytes
+; 		[esp+12] = loops
+;------------------------------------------------------------------------------
+Reader:
+_Reader:
+	push	ebx
+	push	ecx
+	push	edx
+
+	mov	ecx, [esp+12+12]	; loops to do.
+
+	mov	edx, [esp+4+12]	; ptr to memory chunk.
+	mov	ebx, edx	; ebx = limit in memory
+	add	ebx, [esp+8+12]
+
+.L1:
+	mov	edx, [esp+4+12]	
+
+.L2:
+	mov	eax, [edx]
+	mov	eax, [4+edx]
+	mov	eax, [8+edx]
+	mov	eax, [12+edx]
+	mov	eax, [16+edx]
+	mov	eax, [20+edx]
+	mov	eax, [24+edx]
+	mov	eax, [28+edx]
+	mov	eax, [32+edx]
+	mov	eax, [36+edx]
+	mov	eax, [40+edx]
+	mov	eax, [44+edx]
+	mov	eax, [48+edx]
+	mov	eax, [52+edx]
+	mov	eax, [56+edx]
+	mov	eax, [60+edx]
+	mov	eax, [64+edx]
+	mov	eax, [68+edx]
+	mov	eax, [72+edx]
+	mov	eax, [76+edx]
+	mov	eax, [80+edx]
+	mov	eax, [84+edx]
+	mov	eax, [88+edx]
+	mov	eax, [92+edx]
+	mov	eax, [96+edx]
+	mov	eax, [100+edx]
+	mov	eax, [104+edx]
+	mov	eax, [108+edx]
+	mov	eax, [112+edx]
+	mov	eax, [116+edx]
+	mov	eax, [120+edx]
+	mov	eax, [124+edx]
+
+	mov	eax, [edx+128]
+	mov	eax, [edx+132]
+	mov	eax, [edx+136]
+	mov	eax, [edx+140]
+	mov	eax, [edx+144]
+	mov	eax, [edx+148]
+	mov	eax, [edx+152]
+	mov	eax, [edx+156]
+	mov	eax, [edx+160]
+	mov	eax, [edx+164]
+	mov	eax, [edx+168]
+	mov	eax, [edx+172]
+	mov	eax, [edx+176]
+	mov	eax, [edx+180]
+	mov	eax, [edx+184]
+	mov	eax, [edx+188]
+	mov	eax, [edx+192]
+	mov	eax, [edx+196]
+	mov	eax, [edx+200]
+	mov	eax, [edx+204]
+	mov	eax, [edx+208]
+	mov	eax, [edx+212]
+	mov	eax, [edx+216]
+	mov	eax, [edx+220]
+	mov	eax, [edx+224]
+	mov	eax, [edx+228]
+	mov	eax, [edx+232]
+	mov	eax, [edx+236]
+	mov	eax, [edx+240]
+	mov	eax, [edx+244]
+	mov	eax, [edx+248]
+	mov	eax, [edx+252]
+
+	add	edx, 256
+	cmp	edx, ebx
+	jb	.L2
+
+	sub	ecx, 1
+	jnz	.L1
+
+	pop	edx
+	pop	ecx
+	pop	ebx
+	ret
+
+
+;------------------------------------------------------------------------------
+; Name:		Writer
+; Purpose:	Writes 32-bit value sequentially to an area of memory.
+; Params:	
+;		[esp+4]	= ptr to memory area
+; 		[esp+8] = length in bytes
+; 		[esp+12] = loops
+; 		[esp+16] = long to write
+;------------------------------------------------------------------------------
+Writer:
+_Writer:
+	push	ebx
+	push	ecx
+	push	edx
+
+	mov	ecx, [esp+12+12]
+	mov	eax, [esp+16+12]
+
+	mov	edx, [esp+4+12]	; edx = ptr to chunk
+	mov	ebx, edx
+	add	ebx, [esp+8+12]	; ebx = limit in memory
+
+.L1:
+	mov	edx, [esp+4+12]
+
+.L2:
+	mov	[edx], eax
+	mov	[4+edx], eax
+	mov	[8+edx], eax
+	mov	[12+edx], eax
+	mov	[16+edx], eax
+	mov	[20+edx], eax
+	mov	[24+edx], eax
+	mov	[28+edx], eax
+	mov	[32+edx], eax
+	mov	[36+edx], eax
+	mov	[40+edx], eax
+	mov	[44+edx], eax
+	mov	[48+edx], eax
+	mov	[52+edx], eax
+	mov	[56+edx], eax
+	mov	[60+edx], eax
+	mov	[64+edx], eax
+	mov	[68+edx], eax
+	mov	[72+edx], eax
+	mov	[76+edx], eax
+	mov	[80+edx], eax
+	mov	[84+edx], eax
+	mov	[88+edx], eax
+	mov	[92+edx], eax
+	mov	[96+edx], eax
+	mov	[100+edx], eax
+	mov	[104+edx], eax
+	mov	[108+edx], eax
+	mov	[112+edx], eax
+	mov	[116+edx], eax
+	mov	[120+edx], eax
+	mov	[124+edx], eax
+
+	mov	[edx+128], eax
+	mov	[edx+132], eax
+	mov	[edx+136], eax
+	mov	[edx+140], eax
+	mov	[edx+144], eax
+	mov	[edx+148], eax
+	mov	[edx+152], eax
+	mov	[edx+156], eax
+	mov	[edx+160], eax
+	mov	[edx+164], eax
+	mov	[edx+168], eax
+	mov	[edx+172], eax
+	mov	[edx+176], eax
+	mov	[edx+180], eax
+	mov	[edx+184], eax
+	mov	[edx+188], eax
+	mov	[edx+192], eax
+	mov	[edx+196], eax
+	mov	[edx+200], eax
+	mov	[edx+204], eax
+	mov	[edx+208], eax
+	mov	[edx+212], eax
+	mov	[edx+216], eax
+	mov	[edx+220], eax
+	mov	[edx+224], eax
+	mov	[edx+228], eax
+	mov	[edx+232], eax
+	mov	[edx+236], eax
+	mov	[edx+240], eax
+	mov	[edx+244], eax
+	mov	[edx+248], eax
+	mov	[edx+252], eax
+
+	add	edx, 256
+	cmp	edx, ebx
+	jb	.L2
+
+	sub	ecx, 1
+	jnz	.L1
+
+	pop	edx
+	pop	ecx
+	pop	ebx
+	ret
+
+
+;------------------------------------------------------------------------------
+; Name:		has_sse2
+; Purpose:	Returns eax = 1 if CPUID leaf 1 reports SSE2 (EDX bit 26), else 0.
+has_sse2:
+_has_sse2:
+	push	ebx		; cpuid overwrites ebx, ecx and edx, so save them.
+	push 	ecx
+	push 	edx
+	mov	eax, 1		; Select CPUID leaf 1.
+	cpuid
+	xor	eax, eax	; Clear the result before setnz writes al.
+	test	edx, 0x4000000	; 0x4000000 = 1 << 26.
+	setnz	al
+	pop	edx
+	pop	ecx
+	pop	ebx
+	ret
+
+
+;------------------------------------------------------------------------------
+; Name:		ReaderSSE2
+; Purpose:	Reads 128-bit values sequentially from an area of memory.
+; Params:	[esp+4] = ptr to memory area
+; 		[esp+8] = length in bytes
+; 		[esp+12] = loops
+;------------------------------------------------------------------------------
+ReaderSSE2:
+_ReaderSSE2:
+	push	ebx
+	push	ecx
+
+	mov	ecx, [esp+12+8]
+
+	mov	eax, [esp+4+8]
+	mov	ebx, eax
+	add	ebx, [esp+8+8]	; ebx points to end.
+
+.L1:
+	mov	eax, [esp+4+8]
+
+.L2:
+	movdqa	xmm0, [eax]	; Read aligned @ 16-byte boundary.
+	movdqa	xmm0, [16+eax]
+	movdqa	xmm0, [32+eax]
+	movdqa	xmm0, [48+eax]
+	movdqa	xmm0, [64+eax]
+	movdqa	xmm0, [80+eax]
+	movdqa	xmm0, [96+eax]
+	movdqa	xmm0, [112+eax]
+
+	movdqa	xmm0, [128+eax]
+	movdqa	xmm0, [144+eax]
+	movdqa	xmm0, [160+eax]
+	movdqa	xmm0, [176+eax]
+	movdqa	xmm0, [192+eax]
+	movdqa	xmm0, [208+eax]
+	movdqa	xmm0, [224+eax]
+	movdqa	xmm0, [240+eax]
+
+	add	eax, 256
+	cmp	eax, ebx
+	jb	.L2
+
+	sub	ecx, 1
+	jnz	.L1
+	
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		WriterSSE2
+; Purpose:	Writes 128-bit values sequentially to an area of memory.
+; Params:	[esp+4] = ptr to memory area (movdqa needs 16-byte alignment)
+; 		[esp+8] = length in bytes (consumed in 256-byte steps)
+; 		[esp+12] = loops
+; 		[esp+16] = 32-bit value, replicated into all four dwords
+;------------------------------------------------------------------------------
+WriterSSE2:
+_WriterSSE2:
+	push	ebx
+	push	ecx
+
+	mov	eax, [esp+16+8]	; value; +8 skips the two saved registers.
+	movd	xmm0, eax	; Create a 128-bit replication of the 32-bit
+	movd	xmm1, eax	; value that was provided.
+	movd	xmm2, eax
+	movd	xmm3, eax
+	pslldq	xmm1, 4		; pslldq counts BYTES: copy goes to dword 1,
+	pslldq	xmm2, 8		; dword 2,
+	pslldq	xmm3, 12	; dword 3.
+	por	xmm0, xmm1
+	por	xmm0, xmm2
+	por	xmm0, xmm3	; xmm0 = value in every dword.
+
+	mov	ecx, [esp+12+8]	; loops to do.
+
+	mov	eax, [esp+4+8]
+	mov	ebx, eax
+	add	ebx, [esp+8+8]	; ebx points to end.
+
+.L1:
+	mov	eax, [esp+4+8]	; restart at the beginning of the area.
+
+.L2:
+	movdqa	[eax], xmm0	; 16 aligned stores of 16 bytes = 256 bytes.
+	movdqa	[16+eax], xmm0
+	movdqa	[32+eax], xmm0
+	movdqa	[48+eax], xmm0
+	movdqa	[64+eax], xmm0
+	movdqa	[80+eax], xmm0
+	movdqa	[96+eax], xmm0
+	movdqa	[112+eax], xmm0
+
+	movdqa	[128+eax], xmm0
+	movdqa	[144+eax], xmm0
+	movdqa	[160+eax], xmm0
+	movdqa	[176+eax], xmm0
+	movdqa	[192+eax], xmm0
+	movdqa	[208+eax], xmm0
+	movdqa	[224+eax], xmm0
+	movdqa	[240+eax], xmm0
+
+	add	eax, 256
+	cmp	eax, ebx
+	jb	.L2
+
+	sub	ecx, 1
+	jnz	.L1
+
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		WriterSSE2_bypass
+; Purpose:	Writes 128-bit values sequentially to an area of memory,
+;		bypassing the cache (movntdq non-temporal stores).
+; Params:	[esp+4] = ptr to memory area (16-byte aligned)
+; 		[esp+8] = length in bytes (consumed in 256-byte steps)
+; 		[esp+12] = loops
+; 		[esp+16] = 32-bit value, replicated into all four dwords
+;------------------------------------------------------------------------------
+WriterSSE2_bypass:
+_WriterSSE2_bypass:
+	push	ebx
+	push	ecx
+
+	mov	eax, [esp+16+8]	; value; +8 skips the two saved registers.
+	movd	xmm0, eax	; Create a 128-bit replication of the 32-bit
+	movd	xmm1, eax	; value that was provided.
+	movd	xmm2, eax
+	movd	xmm3, eax
+	pslldq	xmm1, 4		; pslldq counts BYTES: copy goes to dword 1,
+	pslldq	xmm2, 8		; dword 2,
+	pslldq	xmm3, 12	; dword 3.
+	por	xmm0, xmm1
+	por	xmm0, xmm2
+	por	xmm0, xmm3	; xmm0 = value in every dword.
+
+	mov	ecx, [esp+12+8]	; loops to do.
+
+	mov	eax, [esp+4+8]
+	mov	ebx, eax
+	add	ebx, [esp+8+8]	; ebx points to end.
+
+.L1:
+	mov	eax, [esp+4+8]	; restart at the beginning of the area.
+
+.L2:
+	movntdq	[eax], xmm0	; Write bypassing cache.
+	movntdq	[16+eax], xmm0
+	movntdq	[32+eax], xmm0
+	movntdq	[48+eax], xmm0
+	movntdq	[64+eax], xmm0
+	movntdq	[80+eax], xmm0
+	movntdq	[96+eax], xmm0
+	movntdq	[112+eax], xmm0
+
+	movntdq	[128+eax], xmm0
+	movntdq	[144+eax], xmm0
+	movntdq	[160+eax], xmm0
+	movntdq	[176+eax], xmm0
+	movntdq	[192+eax], xmm0
+	movntdq	[208+eax], xmm0
+	movntdq	[224+eax], xmm0
+	movntdq	[240+eax], xmm0
+
+	add	eax, 256
+	cmp	eax, ebx
+	jb	.L2
+
+	sub	ecx, 1
+	jnz	.L1
+
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		RandomReader
+; Purpose:	Reads 32-bit values randomly from an area of memory.
+; Params:	
+;		[esp+4]	= ptr to array of chunk pointers
+; 		[esp+8] = # of 256-byte chunks (offsets 0..252 are read)
+; 		[esp+12] = loops
+;------------------------------------------------------------------------------
+RandomReader:
+_RandomReader:
+	push	ebx
+	push	ecx
+	push	edx
+
+	mov	ecx, [esp+12+12]	; loops to do.
+
+.L0:
+	mov	ebx, [esp+8+12]		; # chunks to do
+
+.L1:
+	sub	ebx, 1
+	jc	.L2
+
+	mov	edx, [esp+4+12]  	; get ptr to memory chunk.
+	mov	edx, [edx + 4*ebx]
+
+	mov	eax, [edx+160]
+	mov	eax, [edx+232]
+	mov	eax, [edx+224]
+	mov	eax, [96+edx]
+	mov	eax, [edx+164]
+	mov	eax, [76+edx]
+	mov	eax, [100+edx]
+	mov	eax, [edx+220]
+	mov	eax, [edx+248]
+	mov	eax, [104+edx]
+	mov	eax, [4+edx]
+	mov	eax, [edx+136]
+	mov	eax, [112+edx]
+	mov	eax, [edx+200]
+	mov	eax, [12+edx]
+	mov	eax, [edx+128]
+	mov	eax, [edx+148]
+	mov	eax, [edx+196]
+	mov	eax, [edx+216]
+	mov	eax, [edx]
+	mov	eax, [84+edx]
+	mov	eax, [edx+140]
+	mov	eax, [edx+204]
+	mov	eax, [edx+184]
+	mov	eax, [124+edx]
+	mov	eax, [48+edx]
+	mov	eax, [64+edx]
+	mov	eax, [edx+212]
+	mov	eax, [edx+240]
+	mov	eax, [edx+236]
+	mov	eax, [24+edx]
+	mov	eax, [edx+252]
+	mov	eax, [68+edx]
+	mov	eax, [20+edx]
+	mov	eax, [72+edx]
+	mov	eax, [32+edx]
+	mov	eax, [28+edx]
+	mov	eax, [52+edx]
+	mov	eax, [edx+244]
+	mov	eax, [edx+180]
+	mov	eax, [80+edx]
+	mov	eax, [60+edx]
+	mov	eax, [8+edx]
+	mov	eax, [56+edx]
+	mov	eax, [edx+208]
+	mov	eax, [edx+228]
+	mov	eax, [40+edx]
+	mov	eax, [edx+172]
+	mov	eax, [120+edx]
+	mov	eax, [edx+176]
+	mov	eax, [108+edx]
+	mov	eax, [edx+132]
+	mov	eax, [16+edx]
+	mov	eax, [44+edx]
+	mov	eax, [92+edx]
+	mov	eax, [edx+168]
+	mov	eax, [edx+152]
+	mov	eax, [edx+156]
+	mov	eax, [edx+188]
+	mov	eax, [36+edx]
+	mov	eax, [88+edx]
+	mov	eax, [116+edx]
+	mov	eax, [edx+192]
+	mov	eax, [edx+144]
+
+	jmp	.L1
+
+.L2:
+	sub	ecx, 1
+	jnz	.L0
+
+	pop	edx
+	pop	ecx
+	pop	ebx
+	ret
+
+
+;------------------------------------------------------------------------------
+; Name:		RandomReaderSSE2
+; Purpose:	Reads 128-bit values randomly from an area of memory.
+; Params:	
+;		[esp+4]	= ptr to array of chunk pointers
+; 		[esp+8] = # of 256-byte chunks (offsets 0..240 are read)
+; 		[esp+12] = loops
+;------------------------------------------------------------------------------
+RandomReaderSSE2:
+_RandomReaderSSE2:
+	push	ebx
+	push	ecx
+	push	edx
+
+	mov	ecx, [esp+12+12]	; loops to do.
+
+.L0:
+	mov	ebx, [esp+8+12]		; # chunks to do
+
+.L1:
+	sub	ebx, 1
+	jc	.L2
+
+	mov	edx, [esp+4+12]  	; get ptr to memory chunk.
+	mov	edx, [edx + 4*ebx]
+
+; Read aligned @ 16-byte boundary.
+	movdqa	xmm0, [240+edx]
+	movdqa	xmm0, [128+edx]
+	movdqa	xmm0, [64+edx]
+	movdqa	xmm0, [208+edx]
+	movdqa	xmm0, [112+edx]
+	movdqa	xmm0, [176+edx]
+	movdqa	xmm0, [144+edx]
+	movdqa	xmm0, [edx]
+	movdqa	xmm0, [96+edx]
+	movdqa	xmm0, [16+edx]
+	movdqa	xmm0, [192+edx]
+	movdqa	xmm0, [160+edx]
+	movdqa	xmm0, [32+edx]
+	movdqa	xmm0, [48+edx]
+	movdqa	xmm0, [224+edx]
+	movdqa	xmm0, [80+edx]
+
+	jmp	.L1
+
+.L2:
+	sub	ecx, 1
+	jnz	.L0
+
+	pop	edx
+	pop	ecx
+	pop	ebx
+	ret
+
+
+;------------------------------------------------------------------------------
+; Name:		RandomWriter
+; Purpose:	Writes 32-bit value randomly to an area of memory.
+; Params:	
+;		[esp+4]	= ptr to array of chunk pointers
+; 		[esp+8] = # of 256-byte chunks (offsets 0..252 are written)
+; 		[esp+12] = loops
+; 		[esp+16] = long to write
+;------------------------------------------------------------------------------
+RandomWriter:
+_RandomWriter:
+	push	ebx
+	push	ecx
+	push	edx
+
+	mov	eax, [esp+16+12]	; get datum.
+	mov	ecx, [esp+12+12]	; loops to do.
+
+.L0:
+	mov	ebx, [esp+8+12]		; # chunks to do
+
+.L1:
+	sub	ebx, 1
+	jc	.L2
+
+	mov	edx, [esp+4+12]  	; get ptr to memory chunk.
+	mov	edx, [edx + 4*ebx]
+
+	mov	[edx+212], eax
+	mov	[edx+156], eax
+	mov	[edx+132], eax
+	mov	[20+edx], eax
+	mov	[edx+172], eax
+	mov	[edx+196], eax
+	mov	[edx+248], eax
+	mov	[edx], eax
+	mov	[edx+136], eax
+	mov	[edx+228], eax
+	mov	[edx+160], eax
+	mov	[80+edx], eax
+	mov	[76+edx], eax
+	mov	[32+edx], eax
+	mov	[64+edx], eax
+	mov	[68+edx], eax
+	mov	[120+edx], eax
+	mov	[edx+216], eax
+	mov	[124+edx], eax
+	mov	[28+edx], eax
+	mov	[edx+152], eax
+	mov	[36+edx], eax
+	mov	[edx+220], eax
+	mov	[edx+188], eax
+	mov	[48+edx], eax
+	mov	[104+edx], eax
+	mov	[72+edx], eax
+	mov	[96+edx], eax
+	mov	[edx+184], eax
+	mov	[112+edx], eax
+	mov	[edx+236], eax
+	mov	[edx+224], eax
+	mov	[edx+252], eax
+	mov	[88+edx], eax
+	mov	[edx+180], eax
+	mov	[60+edx], eax
+	mov	[24+edx], eax
+	mov	[edx+192], eax
+	mov	[edx+164], eax
+	mov	[edx+204], eax
+	mov	[44+edx], eax
+	mov	[edx+168], eax
+	mov	[92+edx], eax
+	mov	[edx+208], eax
+	mov	[8+edx], eax
+	mov	[edx+144], eax
+	mov	[edx+148], eax
+	mov	[edx+128], eax
+	mov	[52+edx], eax
+	mov	[4+edx], eax
+	mov	[108+edx], eax
+	mov	[12+edx], eax
+	mov	[56+edx], eax
+	mov	[edx+200], eax
+	mov	[edx+232], eax
+	mov	[16+edx], eax
+	mov	[edx+244], eax
+	mov	[40+edx], eax
+	mov	[edx+140], eax
+	mov	[84+edx], eax
+	mov	[100+edx], eax
+	mov	[116+edx], eax
+	mov	[edx+176], eax
+	mov	[edx+240], eax
+
+	jmp	.L1
+
+.L2:
+	sub	ecx, 1
+	jnz	.L0
+
+	pop	edx
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		RandomWriterSSE2
+; Purpose:	Writes 128-bit value randomly to an area of memory.
+; Params:	(each chunk must be 16-byte aligned for movdqa)
+;		[esp+4]	= ptr to array of chunk pointers
+; 		[esp+8] = # of 256-byte chunks
+; 		[esp+12] = loops
+; 		[esp+16] = long to write, replicated into all four dwords
+;------------------------------------------------------------------------------
+RandomWriterSSE2:
+_RandomWriterSSE2:
+	push	ebx
+	push	ecx
+	push	edx
+
+	mov	eax, [esp+16+12]	; get datum; +12 skips 3 saved registers.
+	movd	xmm0, eax	; Create a 128-bit replication of the 32-bit
+	movd	xmm1, eax	; value that was provided.
+	movd	xmm2, eax
+	movd	xmm3, eax
+	pslldq	xmm1, 4		; pslldq counts BYTES: copy goes to dword 1,
+	pslldq	xmm2, 8		; dword 2,
+	pslldq	xmm3, 12	; dword 3.
+	por	xmm0, xmm1
+	por	xmm0, xmm2
+	por	xmm0, xmm3	; xmm0 = value in every dword.
+
+	mov	ecx, [esp+12+12]	; loops to do.
+
+.L0:
+	mov	ebx, [esp+8+12]		; # chunks to do
+
+.L1:
+	sub	ebx, 1			; walk the pointer array from last to first;
+	jc	.L2			; borrow means every chunk has been done.
+
+	mov	edx, [esp+4+12]  	; get ptr to memory chunk.
+	mov	edx, [edx + 4*ebx]
+
+	movdqa	[64+edx], xmm0		; 16 stores cover the chunk in scrambled order.
+	movdqa	[208+edx], xmm0
+	movdqa	[128+edx], xmm0
+	movdqa	[112+edx], xmm0
+	movdqa	[176+edx], xmm0
+	movdqa	[144+edx], xmm0
+	movdqa	[edx], xmm0
+	movdqa	[96+edx], xmm0
+	movdqa	[48+edx], xmm0
+	movdqa	[16+edx], xmm0
+	movdqa	[192+edx], xmm0
+	movdqa	[160+edx], xmm0
+	movdqa	[32+edx], xmm0
+	movdqa	[240+edx], xmm0
+	movdqa	[224+edx], xmm0
+	movdqa	[80+edx], xmm0
+
+	jmp	.L1
+
+.L2:
+	sub	ecx, 1
+	jnz	.L0
+
+	pop	edx
+	pop	ecx
+	pop	ebx
+	ret
+
+
+;------------------------------------------------------------------------------
+; Name:		RandomWriterSSE2_bypass
+; Purpose:	Writes 128-bit value randomly into memory, bypassing caches.
+; Params:	(each chunk must be 16-byte aligned for movntdq)
+;		[esp+4]	= ptr to array of chunk pointers
+; 		[esp+8] = # of 256-byte chunks
+; 		[esp+12] = loops
+; 		[esp+16] = long to write, replicated into all four dwords
+;------------------------------------------------------------------------------
+RandomWriterSSE2_bypass:
+_RandomWriterSSE2_bypass:
+	push	ebx
+	push	ecx
+	push	edx
+
+	mov	eax, [esp+16+12]	; get datum; +12 skips 3 saved registers.
+	movd	xmm0, eax	; Create a 128-bit replication of the 32-bit
+	movd	xmm1, eax	; value that was provided.
+	movd	xmm2, eax
+	movd	xmm3, eax
+	pslldq	xmm1, 4		; pslldq counts BYTES: copy goes to dword 1,
+	pslldq	xmm2, 8		; dword 2,
+	pslldq	xmm3, 12	; dword 3.
+	por	xmm0, xmm1
+	por	xmm0, xmm2
+	por	xmm0, xmm3	; xmm0 = value in every dword.
+
+	mov	ecx, [esp+12+12]	; loops to do.
+
+.L0:
+	mov	ebx, [esp+8+12]		; # chunks to do
+
+.L1:
+	sub	ebx, 1			; walk the pointer array from last to first;
+	jc	.L2			; borrow means every chunk has been done.
+
+	mov	edx, [esp+4+12]  	; get ptr to memory chunk.
+	mov	edx, [edx + 4*ebx]
+
+	movntdq	[128+edx], xmm0		; 16 non-temporal stores, scrambled order.
+	movntdq	[240+edx], xmm0
+	movntdq	[112+edx], xmm0
+	movntdq	[64+edx], xmm0
+	movntdq	[176+edx], xmm0
+	movntdq	[144+edx], xmm0
+	movntdq	[edx], xmm0
+	movntdq	[208+edx], xmm0
+	movntdq	[80+edx], xmm0
+	movntdq	[96+edx], xmm0
+	movntdq	[48+edx], xmm0
+	movntdq	[16+edx], xmm0
+	movntdq	[192+edx], xmm0
+	movntdq	[160+edx], xmm0
+	movntdq	[224+edx], xmm0
+	movntdq	[32+edx], xmm0
+
+	jmp	.L1
+
+.L2:
+	sub	ecx, 1
+	jnz	.L0
+
+	pop	edx
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		RegisterToRegister
+; Purpose:	Reads/writes 32-bit values between registers of 
+;		the main register set.
+; Params:	
+; 		dword [esp+4] = loops
+;------------------------------------------------------------------------------
+RegisterToRegister:
+_RegisterToRegister:
+	push	ebx
+	push	ecx
+
+	mov	ecx, [esp+4+8]	; loops to do.
+
+.L1:
+	mov	eax, ebx	; 64 transfers by 4 bytes = 256 bytes
+	mov	eax, ecx
+	mov	eax, edx
+	mov	eax, esi
+	mov	eax, edi
+	mov	eax, ebp
+	mov	eax, esp
+	mov	eax, ebx
+	mov	eax, ebx
+	mov	eax, ecx
+	mov	eax, edx
+	mov	eax, esi
+	mov	eax, edi
+	mov	eax, ebp
+	mov	eax, esp
+	mov	eax, ebx
+	mov	eax, ebx
+	mov	eax, ecx
+	mov	eax, edx
+	mov	eax, esi
+	mov	eax, edi
+	mov	eax, ebp
+	mov	eax, esp
+	mov	eax, ebx
+	mov	eax, ebx
+	mov	eax, ecx
+	mov	eax, edx
+	mov	eax, esi
+	mov	eax, edi
+	mov	eax, ebp
+	mov	eax, esp
+	mov	eax, ebx
+
+	mov	ebx, eax
+	mov	ebx, ecx
+	mov	ebx, edx
+	mov	ebx, esi
+	mov	ebx, edi
+	mov	ebx, ebp
+	mov	ebx, esp
+	mov	ebx, eax
+	mov	ebx, eax
+	mov	ebx, ecx
+	mov	ebx, edx
+	mov	ebx, esi
+	mov	ebx, edi
+	mov	ebx, ebp
+	mov	ebx, esp
+	mov	ebx, eax
+	mov	ebx, eax
+	mov	ebx, ecx
+	mov	ebx, edx
+	mov	ebx, esi
+	mov	ebx, edi
+	mov	ebx, ebp
+	mov	ebx, esp
+	mov	ebx, eax
+	mov	ebx, eax
+	mov	ebx, ecx
+	mov	ebx, edx
+	mov	ebx, esi
+	mov	ebx, edi
+	mov	ebx, ebp
+	mov	ebx, esp
+	mov	ebx, eax
+
+	dec	ecx
+	jnz	.L1
+
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		VectorToVector
+; Purpose:	Reads/writes 128-bit values between registers of 
+;		the vector register set, in this case XMM.
+;		(I don't have access to anything with YMM.)
+; Params:	dword [esp + 4] = count.
+;------------------------------------------------------------------------------
+VectorToVector:
+_VectorToVector:
+	mov	eax, [esp + 4]
+.L1:
+	movdqa	xmm0, xmm1
+	movdqa	xmm0, xmm2
+	movdqa	xmm0, xmm3
+	movdqa	xmm2, xmm0
+	movdqa	xmm1, xmm2
+	movdqa	xmm2, xmm1
+	movdqa	xmm0, xmm3
+	movdqa	xmm3, xmm1
+
+	movdqa	xmm3, xmm2
+	movdqa	xmm1, xmm3
+	movdqa	xmm2, xmm1
+	movdqa	xmm0, xmm1
+	movdqa	xmm1, xmm2
+	movdqa	xmm0, xmm1
+	movdqa	xmm0, xmm3
+	movdqa	xmm3, xmm0
+
+	dec	eax
+	jnz	.L1
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		RegisterToVector
+; Purpose:	Writes 32-bit main register values into 128-bit vector register
+;		clearing the upper unused bits.
+; Params:	dword [esp + 4] = count.
+;------------------------------------------------------------------------------
+RegisterToVector:
+_RegisterToVector:
+	mov 	eax, [esp + 4]
+	add	eax, eax	; Double # of loops.
+.L1:
+	movd	xmm1, eax	; 32 transfers of 4 bytes = 128 bytes
+	movd	xmm2, eax
+	movd	xmm3, eax
+	movd	xmm0, eax
+	movd	xmm1, eax
+	movd	xmm2, eax
+	movd	xmm3, eax
+	movd	xmm0, eax
+
+	movd	xmm1, eax
+	movd	xmm3, eax
+	movd	xmm2, eax
+	movd	xmm0, eax
+	movd	xmm1, eax
+	movd	xmm2, eax
+	movd	xmm3, eax
+	movd	xmm0, eax
+
+	movd	xmm0, eax
+	movd	xmm2, eax
+	movd	xmm0, eax
+	movd	xmm3, eax
+	movd	xmm1, eax
+	movd	xmm3, eax
+	movd	xmm2, eax
+	movd	xmm0, eax
+
+	movd	xmm0, eax
+	movd	xmm3, eax
+	movd	xmm1, eax
+	movd	xmm2, eax
+	movd	xmm0, eax
+	movd	xmm2, eax
+	movd	xmm3, eax
+	movd	xmm0, eax
+
+	dec	eax
+	jnz	.L1
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		VectorToRegister
+; Purpose:	Writes lowest 32 bits of vector registers into 32-bit main
+;		register.
+; Params:	dword [esp + 4] = count.
+;------------------------------------------------------------------------------
+VectorToRegister:
+_VectorToRegister:
+	mov 	eax, [esp + 4]
+	add	eax, eax	; Double # of loops.
+	push	ebx
+.L1:
+	movd	ebx, xmm1	; 4 bytes per transfer therefore need 64
+	movd	ebx, xmm2	; to transfer 256 bytes.
+	movd	ebx, xmm3
+	movd	ebx, xmm0
+	movd	ebx, xmm1
+	movd	ebx, xmm2
+	movd	ebx, xmm3
+	movd	ebx, xmm0
+
+	movd	ebx, xmm1
+	movd	ebx, xmm3
+	movd	ebx, xmm2
+	movd	ebx, xmm0
+	movd	ebx, xmm1
+	movd	ebx, xmm2
+	movd	ebx, xmm3
+	movd	ebx, xmm0
+
+	movd	ebx, xmm0
+	movd	ebx, xmm2
+	movd	ebx, xmm0
+	movd	ebx, xmm3
+	movd	ebx, xmm1
+	movd	ebx, xmm3
+	movd	ebx, xmm2
+	movd	ebx, xmm0
+
+	movd	ebx, xmm0
+	movd	ebx, xmm3
+	movd	ebx, xmm1
+	movd	ebx, xmm2
+	movd	ebx, xmm0
+	movd	ebx, xmm2
+	movd	ebx, xmm3
+	movd	ebx, xmm0
+
+	dec	eax
+	jnz	.L1
+
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		StackReader
+; Purpose:	Reads 32-bit values off the stack into registers of
+;		the main register set, effectively testing L1 cache access
+;		*and* effective-address calculation speed.
+; Params:	
+; 		dword [esp+4] = loops
+;------------------------------------------------------------------------------
+StackReader:
+_StackReader:
+	push	ebx
+	push	ecx
+
+	mov	ecx, [esp+4+8]	; loops to do.
+
+	push	dword 7000	; [esp+24]
+	push	dword 6000	; [esp+20]
+	push	dword 5000	; [esp+16]
+	push	dword 4000	; [esp+12]
+	push	dword 3000	; [esp+8]
+	push	dword 2000	; [esp+4]
+	push	dword 1000	; [esp]
+
+.L1:
+	mov	eax, [esp]
+	mov	eax, [esp+8]
+	mov	eax, [esp+12]
+	mov	eax, [esp+16]
+	mov	eax, [esp+20]
+	mov	eax, [esp+4]
+	mov	eax, [esp+24]
+	mov	eax, [esp]
+	mov	eax, [esp]
+	mov	eax, [esp+8]
+	mov	eax, [esp+12]
+	mov	eax, [esp+16]
+	mov	eax, [esp+20]
+	mov	eax, [esp+4]
+	mov	eax, [esp+24]
+	mov	eax, [esp]
+	mov	eax, [esp]
+	mov	eax, [esp+8]
+	mov	eax, [esp+12]
+	mov	eax, [esp+16]
+	mov	eax, [esp+20]
+	mov	eax, [esp+4]
+	mov	eax, [esp+24]
+	mov	eax, [esp+4]
+	mov	eax, [esp+4]
+	mov	eax, [esp+8]
+	mov	eax, [esp+12]
+	mov	eax, [esp+16]
+	mov	eax, [esp+20]
+	mov	eax, [esp+4]
+	mov	eax, [esp+24]
+	mov	eax, [esp+4]
+
+	mov	ebx, [esp]
+	mov	ebx, [esp+8]
+	mov	ebx, [esp+12]
+	mov	ebx, [esp+16]
+	mov	ebx, [esp+20]
+	mov	ebx, [esp+4]
+	mov	ebx, [esp+24]
+	mov	ebx, [esp]
+	mov	ebx, [esp]
+	mov	ebx, [esp+8]
+	mov	ebx, [esp+12]
+	mov	ebx, [esp+16]
+	mov	ebx, [esp+20]
+	mov	ebx, [esp+4]
+	mov	ebx, [esp+24]
+	mov	ebx, [esp]
+	mov	ebx, [esp]
+	mov	ebx, [esp+8]
+	mov	ebx, [esp+12]
+	mov	ebx, [esp+16]
+	mov	ebx, [esp+20]
+	mov	ebx, [esp+4]
+	mov	ebx, [esp+24]
+	mov	ebx, [esp+4]
+	mov	ebx, [esp+4]
+	mov	ebx, [esp+8]
+	mov	ebx, [esp+12]
+	mov	ebx, [esp+16]
+	mov	ebx, [esp+20]
+	mov	ebx, [esp+4]
+	mov	ebx, [esp+24]
+	mov	ebx, [esp+4]
+
+	dec	ecx
+	jnz	.L1
+
+	add	esp, 28
+
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		StackWriter
+; Purpose:	Writes 32-bit values into the stack from registers of
+;		the main register set, effectively testing L1 cache access
+;		*and* effective-address calculation speed.
+; Params:	
+; 		dword [esp+4] = loops
+;------------------------------------------------------------------------------
+StackWriter:
+_StackWriter:
+	push	ebx
+	push	ecx
+
+	mov	ecx, [esp+4+8]	; loops to do.
+
+	push	dword 7000	; [esp+24]
+	push	dword 6000	; [esp+20]
+	push	dword 5000	; [esp+16]
+	push	dword 4000	; [esp+12]
+	push	dword 3000	; [esp+8]
+	push	dword 2000	; [esp+4]
+	push	dword 1000	; [esp]
+
+	xor	eax, eax
+	mov	ebx, 0xffffffff
+
+.L1:
+	mov	[esp], eax
+	mov	[esp+8], eax
+	mov	[esp+12], eax
+	mov	[esp+16], eax
+	mov	[esp+20], eax
+	mov	[esp+4], eax
+	mov	[esp+24], eax
+	mov	[esp], eax
+	mov	[esp], eax
+	mov	[esp+8], eax
+	mov	[esp+12], eax
+	mov	[esp+16], eax
+	mov	[esp+20], eax
+	mov	[esp+4], eax
+	mov	[esp+24], eax
+	mov	[esp], eax
+	mov	[esp], eax
+	mov	[esp+8], eax
+	mov	[esp+12], eax
+	mov	[esp+16], eax
+	mov	[esp+20], eax
+	mov	[esp+4], eax
+	mov	[esp+24], eax
+	mov	[esp+4], eax
+	mov	[esp+4], eax
+	mov	[esp+8], eax
+	mov	[esp+12], eax
+	mov	[esp+16], eax
+	mov	[esp+20], eax
+	mov	[esp+4], eax
+	mov	[esp+24], eax
+	mov	[esp+4], eax
+
+	mov	[esp], ebx
+	mov	[esp+8], ebx
+	mov	[esp+12], ebx
+	mov	[esp+16], ebx
+	mov	[esp+20], ebx
+	mov	[esp+4], ebx
+	mov	[esp+24], ebx
+	mov	[esp], ebx
+	mov	[esp], ebx
+	mov	[esp+8], ebx
+	mov	[esp+12], ebx
+	mov	[esp+16], ebx
+	mov	[esp+20], ebx
+	mov	[esp+4], ebx
+	mov	[esp+24], ebx
+	mov	[esp], ebx
+	mov	[esp], ebx
+	mov	[esp+8], ebx
+	mov	[esp+12], ebx
+	mov	[esp+16], ebx
+	mov	[esp+20], ebx
+	mov	[esp+4], ebx
+	mov	[esp+24], ebx
+	mov	[esp+4], ebx
+	mov	[esp+4], ebx
+	mov	[esp+8], ebx
+	mov	[esp+12], ebx
+	mov	[esp+16], ebx
+	mov	[esp+20], ebx
+	mov	[esp+4], ebx
+	mov	[esp+24], ebx
+	mov	[esp+4], ebx
+
+	sub	ecx, 1
+	jnz	.L1
+
+	add	esp, 28
+
+	pop	ecx
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Register8ToVector
+; Purpose:	Writes 8-bit main register values into 128-bit vector register
+;		without clearing the unused bits.
+; Params:	dword [esp + 4] = loops
+;------------------------------------------------------------------------------
+Register8ToVector:
+_Register8ToVector:
+	mov	eax, [esp + 4]
+	sal	eax, 4  	; Force some repetition.
+.L1:
+	pinsrb	xmm1, al, 0
+	pinsrb	xmm2, bl, 1
+	pinsrb	xmm3, cl, 2
+	pinsrb	xmm1, dl, 3
+	pinsrb	xmm2, al, 4
+	pinsrb	xmm3, bl, 5
+	pinsrb	xmm0, cl, 6
+	pinsrb	xmm0, dl, 7
+
+	pinsrb	xmm0, al, 0
+	pinsrb	xmm1, bl, 1
+	pinsrb	xmm2, cl, 2
+	pinsrb	xmm3, dl, 3
+	pinsrb	xmm3, al, 4
+	pinsrb	xmm2, bl, 5
+	pinsrb	xmm1, cl, 6
+	pinsrb	xmm0, dl, 7
+
+	dec	eax
+	jnz .L1
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Register16ToVector
+; Purpose:	Writes 16-bit main register values into 128-bit vector register
+;		without clearing the unused bits.
+; Params:	dword [esp + 4] = loops
+;------------------------------------------------------------------------------
+Register16ToVector:
+_Register16ToVector:
+	mov	eax, [esp + 4]
+	sal	eax, 3  	; Force some repetition.
+.L1:
+	pinsrw	xmm1, ax, 0
+	pinsrw	xmm2, bx, 1
+	pinsrw	xmm3, cx, 2
+	pinsrw	xmm1, dx, 3
+	pinsrw	xmm2, si, 4
+	pinsrw	xmm3, di, 5
+	pinsrw	xmm0, bp, 6
+	pinsrw	xmm0, sp, 7
+
+	pinsrw	xmm0, ax, 0
+	pinsrw	xmm1, bx, 1
+	pinsrw	xmm2, cx, 2
+	pinsrw	xmm3, dx, 3
+	pinsrw	xmm3, si, 4
+	pinsrw	xmm2, di, 5
+	pinsrw	xmm1, bp, 6
+	pinsrw	xmm0, sp, 7
+
+	dec	eax
+	jnz .L1
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Register32ToVector
+; Purpose:	Writes 32-bit main register values into 128-bit vector register
+;		without clearing the unused bits.
+; Params:	dword [esp + 4] = loops
+;------------------------------------------------------------------------------
+Register32ToVector:
+_Register32ToVector:
+	mov	eax, [esp + 4]
+	sal	eax, 2  	; Force some repetition.
+.L1:
+	pinsrd	xmm1, eax, 0	; Each xfer moves 4 bytes so to move 256 bytes
+	pinsrd	xmm2, ebx, 1	; we need 64 transfers.
+	pinsrd	xmm3, ecx, 2
+	pinsrd	xmm1, edx, 3
+	pinsrd	xmm2, esi, 0
+	pinsrd	xmm3, edi, 1
+	pinsrd	xmm0, ebp, 2
+	pinsrd	xmm0, esp, 3
+
+	pinsrd	xmm0, eax, 0
+	pinsrd	xmm1, ebx, 1
+	pinsrd	xmm2, ecx, 2
+	pinsrd	xmm3, edx, 3
+	pinsrd	xmm3, esi, 3
+	pinsrd	xmm2, edi, 2
+	pinsrd	xmm1, ebp, 1
+	pinsrd	xmm0, esp, 0
+
+	dec	eax
+	jnz .L1
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Register64ToVector
+; Purpose:	Stub in this 32-bit file: returns immediately and performs
+;		no transfers.
+; Params:	none read (nothing is loaded from the stack)
+;------------------------------------------------------------------------------
+Register64ToVector:
+_Register64ToVector:
+	; There are no 64-bit registers on x86.
+	ret
+
+
+;------------------------------------------------------------------------------
+; Name:		Vector8ToRegister
+; Purpose:	Writes 8-bit vector register values into main register.
+; Params:	dword [esp + 4] = loops
+;------------------------------------------------------------------------------
+Vector8ToRegister:
+_Vector8ToRegister:
+	mov	eax, [esp + 4]
+	sal	eax, 4  	; Force some repetition.
+	push 	ebx
+.L1:
+	pextrb	ebx, xmm1, 0
+	pextrb	ebx, xmm2, 1
+	pextrb	ebx, xmm3, 2
+	pextrb	ebx, xmm1, 3
+	pextrb	ebx, xmm2, 4
+	pextrb	ebx, xmm3, 5
+	pextrb	ebx, xmm0, 6
+	pextrb	ebx, xmm0, 7
+
+	pextrb	ebx, xmm0, 0
+	pextrb	ebx, xmm1, 1
+	pextrb	ebx, xmm2, 2
+	pextrb	ebx, xmm3, 3
+	pextrb	ebx, xmm3, 4
+	pextrb	ebx, xmm2, 5
+	pextrb	ebx, xmm1, 6
+	pextrb	ebx, xmm0, 7
+
+	dec	eax
+	jnz .L1
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Vector16ToRegister
+; Purpose:	Writes 16-bit vector register values into main register.
+; Params:	dword [esp + 4] = loops
+;------------------------------------------------------------------------------
+Vector16ToRegister:
+_Vector16ToRegister:
+	mov	eax, [esp + 4]
+	sal	eax, 3  	; Force some repetition.
+	push 	ebx
+.L1:
+	pextrw	ebx, xmm1, 0	; 256 byte chunk / 2 bytes/xfer = 128 xfers.
+	pextrw	ebx, xmm2, 1
+	pextrw	ebx, xmm3, 2
+	pextrw	ebx, xmm1, 3
+	pextrw	ebx, xmm2, 4
+	pextrw	ebx, xmm3, 5
+	pextrw	ebx, xmm0, 6
+	pextrw	ebx, xmm0, 7
+
+	pextrw	ebx, xmm0, 0
+	pextrw	ebx, xmm1, 1
+	pextrw	ebx, xmm2, 2
+	pextrw	ebx, xmm3, 3
+	pextrw	ebx, xmm3, 4
+	pextrw	ebx, xmm2, 5
+	pextrw	ebx, xmm1, 6
+	pextrw	ebx, xmm0, 7
+
+	dec	eax
+	jnz .L1
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Vector32ToRegister
+; Purpose:	Writes 32-bit vector register values into main register.
+; Params:	dword [esp + 4] = loops
+;------------------------------------------------------------------------------
+Vector32ToRegister:
+_Vector32ToRegister:
+	mov	eax, [esp + 4]
+	sal	eax, 2  	; Force some repetition.
+	push 	ebx
+.L1:
+	pextrd	ebx, xmm1, 0	; 256 byte chunk / 4 bytes/xfer = 64 xfers.
+	pextrd	ebx, xmm2, 1
+	pextrd	ebx, xmm3, 2
+	pextrd	ebx, xmm1, 3
+	pextrd	ebx, xmm2, 0
+	pextrd	ebx, xmm3, 1
+	pextrd	ebx, xmm0, 2
+	pextrd	ebx, xmm0, 3
+
+	pextrd	ebx, xmm0, 0
+	pextrd	ebx, xmm1, 1
+	pextrd	ebx, xmm2, 2
+	pextrd	ebx, xmm3, 3
+	pextrd	ebx, xmm3, 3
+	pextrd	ebx, xmm2, 2
+	pextrd	ebx, xmm1, 1
+	pextrd	ebx, xmm0, 0
+
+	dec	eax
+	jnz .L1
+	pop	ebx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Vector64ToRegister
+; Purpose:	Stub in this 32-bit file: returns immediately, no transfers.
+; Params:	none read (nothing is loaded from the stack)
+;------------------------------------------------------------------------------
+Vector64ToRegister:
+_Vector64ToRegister:
+	; There are no 64-bit registers on x86.
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		CopySSE
+; Purpose:	Copies memory chunks that are 16-byte aligned, 128 bytes per step.
+; Params:	[esp + 4]	= ptr to destination memory area
+;		[esp + 8]	= ptr to source memory area
+; 		[esp + 12]	= length in bytes (rounded down to a multiple of 128)
+; 		[esp + 16]	= loops (the whole copy is repeated this many times)
+;------------------------------------------------------------------------------
+CopySSE:
+_CopySSE:
+	; Register usage:
+	; esi = source
+	; edi = dest
+	; ecx = loops
+	; edx = length
+	push	esi
+	push	edi
+	push	ecx
+	push	edx
+	; The extra +16 below skips the four registers just pushed.
+	mov	edi, [esp + 4 + 16]
+	mov	esi, [esp + 8 + 16]
+	mov	edx, [esp + 12 + 16]
+	mov	ecx, [esp + 16 + 16]
+	; NOTE(review): if edx rounds to 0 below, .L2 still runs and eax wraps past 0.
+	shr	edx, 7	; Ensure length is multiple of 128.
+	shl	edx, 7
+
+	; Save our non-parameter XMM registers.
+	sub	esp, 64
+	movdqu	[esp], xmm4
+	movdqu	[16+esp], xmm5
+	movdqu	[32+esp], xmm6
+	movdqu	[48+esp], xmm7
+
+.L1:
+	mov	eax, edx	; eax = bytes left to copy in this pass.
+
+.L2:
+	; prefetchnta	[esi]
+	movdqa	xmm0, [esi]	; Load 128 bytes from the source (aligned loads).
+	movdqa	xmm1, [16+esi]
+	movdqa	xmm2, [32+esi]
+	movdqa	xmm3, [48+esi]
+	movdqa	xmm4, [64+esi]
+	movdqa	xmm5, [80+esi]
+	movdqa	xmm6, [96+esi]
+	movdqa	xmm7, [112+esi]
+
+	movntdq	[edi], xmm0	; Store them with non-temporal writes.
+	movntdq	[16+edi], xmm1
+	movntdq	[32+edi], xmm2
+	movntdq	[48+edi], xmm3
+	movntdq	[64+edi], xmm4
+	movntdq	[80+edi], xmm5
+	movntdq	[96+edi], xmm6
+	movntdq	[112+edi], xmm7
+
+	add	esi, 128
+	add	edi, 128
+
+	sub	eax, 128
+	jnz	.L2
+
+	sub	esi, edx	; esi now points to start.
+	sub	edi, edx	; edi now points to start.
+
+	dec	ecx
+	jnz	.L1
+
+	movdqu	xmm4, [0+esp]	; Restore the XMM registers saved on entry.
+	movdqu	xmm5, [16+esp]
+	movdqu	xmm6, [32+esp]
+	movdqu	xmm7, [48+esp]
+	add	esp, 64
+
+	pop	edx
+	pop	ecx
+	pop	edi
+	pop	esi
+	ret
diff --git a/routines64.asm b/routines64.asm
new file mode 100644
index 0000000..18e8f6e
--- /dev/null
+++ b/routines64.asm
@@ -0,0 +1,1516 @@
+
+; ============================================================================
+;  bandwidth 0.23, a benchmark to estimate memory transfer bandwidth.
+;  Copyright (C) 2005-2010 by Zack T Smith.
+;
+;  This program is free software; you can redistribute it and/or modify
+;  it under the terms of the GNU General Public License as published by
+;  the Free Software Foundation; either version 2 of the License, or
+;  (at your option) any later version.
+;
+;  This program is distributed in the hope that it will be useful,
+;  but WITHOUT ANY WARRANTY; without even the implied warranty of
+;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;  GNU General Public License for more details.
+;
+;  You should have received a copy of the GNU General Public License
+;  along with this program; if not, write to the Free Software
+;  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+;
+;  The author may be reached at fbui@comcast.net.
+; =============================================================================
+
+bits	64
+cpu	x64
+
+global	Reader
+global	RandomReader
+global	ReaderSSE2
+global	RandomReaderSSE2
+
+global	Writer
+global	RandomWriter
+global	WriterSSE2
+global	RandomWriterSSE2
+
+global	WriterSSE2_bypass
+global	RandomWriterSSE2_bypass
+
+global	CopySSE
+global	_CopySSE
+
+global	has_sse2
+
+global	RegisterToRegister
+global	RegisterToVector
+global	VectorToRegister
+global	VectorToVector
+
+global	Register8ToVector
+global	Register16ToVector
+global	Register32ToVector
+global	Register64ToVector
+global	Vector8ToRegister
+global	Vector16ToRegister
+global	Vector32ToRegister
+global	Vector64ToRegister
+
+global	StackReader
+global	StackWriter
+
+global	_Reader
+global	_RandomReader
+global	_ReaderSSE2
+global	_RandomReaderSSE2
+
+global	_Writer
+global	_RandomWriter
+global	_WriterSSE2
+global	_RandomWriterSSE2
+
+global	_WriterSSE2_bypass
+global	_RandomWriterSSE2_bypass
+
+global	_has_sse2
+
+global	_RegisterToRegister
+global	_RegisterToVector
+global	_VectorToRegister
+global	_VectorToVector
+
+global	_Register8ToVector
+global	_Register16ToVector
+global	_Register32ToVector
+global	_Register64ToVector
+global	_Vector8ToRegister
+global	_Vector16ToRegister
+global	_Vector32ToRegister
+global	_Vector64ToRegister
+
+global	_StackReader
+global	_StackWriter
+
+; Note:
+; Unix ABI says integer param are put in these registers in this order:
+;	rdi, rsi, rdx, rcx, r8, r9
+
+	section .text
+
+;------------------------------------------------------------------------------
+; Name:		has_sse2
+has_sse2:		; Returns eax = 1 if CPUID reports SSE2 support, otherwise 0.
+_has_sse2:
+	push	rbx		; cpuid overwrites rbx, so preserve it for the caller.
+	push 	rcx
+	push 	rdx
+	mov	eax, 1		; CPUID leaf 1 returns the feature flags.
+	cpuid
+	test	edx, 0x4000000	; Bit 26 of edx is the SSE2 feature flag.
+	setnz	al
+	movzx	eax, al		; Drop the cpuid output left above al so the result is exactly 0 or 1.
+	pop	rdx
+	pop	rcx
+	pop	rbx
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Reader
+; Purpose:	Reads 64-bit values sequentially from an area of memory.
+; Params:	rdi = ptr to memory area
+; 		rsi = length in bytes
+; 		rdx = loops
+;------------------------------------------------------------------------------
+Reader:
+_Reader:
+	push	r10
+
+	add	rsi, rdi	; rsi now points to end.
+
+.L1:
+	mov	r10, rdi	; Each loop restarts at the beginning of the area.
+
+.L2:
+	mov	rax, [r10]	; 32 qword loads = 256 bytes per pass; the loaded values are discarded.
+	mov	rax, [8+r10]
+	mov	rax, [16+r10]
+	mov	rax, [24+r10]
+	mov	rax, [32+r10]
+	mov	rax, [40+r10]
+	mov	rax, [48+r10]
+	mov	rax, [56+r10]
+	mov	rax, [64+r10]
+	mov	rax, [72+r10]
+	mov	rax, [80+r10]
+	mov	rax, [88+r10]
+	mov	rax, [96+r10]
+	mov	rax, [104+r10]
+	mov	rax, [112+r10]
+	mov	rax, [120+r10]
+	mov	rax, [128+r10]
+	mov	rax, [136+r10]
+	mov	rax, [144+r10]
+	mov	rax, [152+r10]
+	mov	rax, [160+r10]
+	mov	rax, [168+r10]
+	mov	rax, [176+r10]
+	mov	rax, [184+r10]
+	mov	rax, [192+r10]
+	mov	rax, [200+r10]
+	mov	rax, [208+r10]
+	mov	rax, [216+r10]
+	mov	rax, [224+r10]
+	mov	rax, [232+r10]
+	mov	rax, [240+r10]
+	mov	rax, [248+r10]
+
+	add	r10, 256
+	cmp	r10, rsi
+	jb	.L2		; A full 256 bytes is read per pass, so a length that is not a multiple of 256 reads past the end.
+
+	dec	rdx		; rdx = remaining loops; a value of 0 on entry would wrap around.
+	jnz	.L1
+
+	pop	r10
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		RandomReader
+; Purpose:	Reads 64-bit values randomly from an area of memory.
+; Params:	rdi = ptr to array of chunk pointers
+; 		rsi = # of chunks
+; 		rdx = loops
+;------------------------------------------------------------------------------
+RandomReader:
+_RandomReader:
+	push	r10
+	push	r11
+
+.L1:
+	xor	r11, r11	; r11 = index into the chunk-pointer array, reset for every loop.
+
+.L2:
+	mov	r10, [rdi + 8*r11]	; Note, 64-bit pointers.
+
+	mov	rax, [96+r10]	; 32 qword loads covering offsets 0..248 of the chunk in shuffled order.
+	mov	rax, [r10]
+	mov	rax, [120+r10]
+	mov	rax, [184+r10]
+	mov	rax, [160+r10]
+	mov	rax, [176+r10]
+	mov	rax, [112+r10]
+	mov	rax, [80+r10]
+	mov	rax, [32+r10]
+	mov	rax, [128+r10]
+	mov	rax, [88+r10]
+	mov	rax, [40+r10]
+	mov	rax, [48+r10]
+	mov	rax, [72+r10]
+	mov	rax, [200+r10]
+	mov	rax, [24+r10]
+	mov	rax, [152+r10]
+	mov	rax, [16+r10]
+	mov	rax, [248+r10]
+	mov	rax, [56+r10]
+	mov	rax, [240+r10]
+	mov	rax, [208+r10]
+	mov	rax, [104+r10]
+	mov	rax, [216+r10]
+	mov	rax, [136+r10]
+	mov	rax, [232+r10]
+	mov	rax, [64+r10]
+	mov	rax, [224+r10]
+	mov	rax, [144+r10]
+	mov	rax, [192+r10]
+	mov	rax, [8+r10]
+	mov	rax, [168+r10]
+
+	inc	r11
+	cmp	r11, rsi
+	jb	.L2		; Continue until every chunk pointer has been visited.
+
+	dec	rdx		; rdx = remaining loops.
+	jnz	.L1
+
+	pop	r11
+	pop	r10
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		RandomReaderSSE2
+; Purpose:	Reads 128-bit values randomly from an area of memory.
+; Params:	rdi = ptr to array of chunk pointers
+; 		rsi = # of chunks
+; 		rdx = loops
+;------------------------------------------------------------------------------
+RandomReaderSSE2:
+_RandomReaderSSE2:
+	push	r10
+	push	r11
+
+.L1:
+	xor	r11, r11	; r11 = index into the chunk-pointer array, reset for every loop.
+
+.L2:
+	mov	r10, [rdi + 8*r11]	; r10 = address of the current chunk.
+
+	movdqa	xmm0, [240+r10]	; 16 aligned 16-byte loads covering offsets 0..240 in shuffled order.
+	movdqa	xmm0, [128+r10]
+	movdqa	xmm0, [64+r10]
+	movdqa	xmm0, [208+r10]
+	movdqa	xmm0, [112+r10]
+	movdqa	xmm0, [176+r10]
+	movdqa	xmm0, [144+r10]
+	movdqa	xmm0, [r10]
+	movdqa	xmm0, [96+r10]
+	movdqa	xmm0, [16+r10]
+	movdqa	xmm0, [192+r10]
+	movdqa	xmm0, [160+r10]
+	movdqa	xmm0, [32+r10]
+	movdqa	xmm0, [48+r10]
+	movdqa	xmm0, [224+r10]
+	movdqa	xmm0, [80+r10]
+
+	inc	r11
+	cmp	r11, rsi
+	jb	.L2		; Continue until every chunk pointer has been visited.
+
+	dec	rdx		; rdx = remaining loops.
+	jnz	.L1
+
+	pop	r11
+	pop	r10
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		RandomWriter
+; Purpose:	Writes 64-bit values randomly to an area of memory.
+; Params:	rdi = ptr to array of chunk pointers
+; 		rsi = # of chunks
+; 		rdx = loops
+; 		rcx = datum to write
+;------------------------------------------------------------------------------
+RandomWriter:
+_RandomWriter:
+	push	r10
+	push	r11
+
+.L1:
+	xor	r11, r11	; r11 = index into the chunk-pointer array, reset for every loop.
+
+.L2:
+	mov	r10, [rdi + 8*r11]	; Note, 64-bit pointers.
+
+	mov	[96+r10], rcx	; 32 qword stores covering offsets 0..248 of the chunk in shuffled order.
+	mov	[r10], rcx
+	mov	[120+r10], rcx
+	mov	[184+r10], rcx
+	mov	[160+r10], rcx
+	mov	[176+r10], rcx
+	mov	[112+r10], rcx
+	mov	[80+r10], rcx
+	mov	[32+r10], rcx
+	mov	[128+r10], rcx
+	mov	[88+r10], rcx
+	mov	[40+r10], rcx
+	mov	[48+r10], rcx
+	mov	[72+r10], rcx
+	mov	[200+r10], rcx
+	mov	[24+r10], rcx
+	mov	[152+r10], rcx
+	mov	[16+r10], rcx
+	mov	[248+r10], rcx
+	mov	[56+r10], rcx
+	mov	[240+r10], rcx
+	mov	[208+r10], rcx
+	mov	[104+r10], rcx
+	mov	[216+r10], rcx
+	mov	[136+r10], rcx
+	mov	[232+r10], rcx
+	mov	[64+r10], rcx
+	mov	[224+r10], rcx
+	mov	[144+r10], rcx
+	mov	[192+r10], rcx
+	mov	[8+r10], rcx
+	mov	[168+r10], rcx
+
+	inc	r11
+	cmp	r11, rsi
+	jb	.L2		; Continue until every chunk pointer has been visited.
+
+	dec	rdx		; rdx = remaining loops.
+	jnz	.L1
+
+	pop	r11
+	pop	r10
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		RandomWriterSSE2
+; Purpose:	Writes 128-bit values randomly to an area of memory.
+; Params:	rdi = ptr to array of chunk pointers
+; 		rsi = # of chunks
+; 		rdx = loops
+; 		rcx = datum to write
+;------------------------------------------------------------------------------
+RandomWriterSSE2:
+_RandomWriterSSE2:
+	push	r10
+	push	r11
+
+	movq	xmm0, rcx	; Create duplicated 128-bit datum
+	movq	xmm1, rcx
+	pslldq	xmm1, 8		; pslldq shifts by bytes: 8 moves the copy into the upper qword.
+	por	xmm0, xmm1	; xmm0 = datum in both 64-bit halves.
+
+.L1:
+	xor	r11, r11	; r11 = index into the chunk-pointer array, reset for every loop.
+
+.L2:
+	mov	r10, [rdi + 8*r11]	; Note, 64-bit pointers.
+
+	movdqa	[240+r10], xmm0	; 16 aligned 16-byte stores covering offsets 0..240 in shuffled order.
+	movdqa	[128+r10], xmm0
+	movdqa	[208+r10], xmm0
+	movdqa	[112+r10], xmm0
+	movdqa	[64+r10], xmm0
+	movdqa	[176+r10], xmm0
+	movdqa	[144+r10], xmm0
+	movdqa	[r10], xmm0
+	movdqa	[96+r10], xmm0
+	movdqa	[16+r10], xmm0
+	movdqa	[192+r10], xmm0
+	movdqa	[160+r10], xmm0
+	movdqa	[32+r10], xmm0
+	movdqa	[48+r10], xmm0
+	movdqa	[224+r10], xmm0
+	movdqa	[80+r10], xmm0
+
+	inc	r11
+	cmp	r11, rsi
+	jb	.L2		; Continue until every chunk pointer has been visited.
+
+	dec	rdx		; rdx = remaining loops.
+	jnz	.L1
+
+	pop	r11
+	pop	r10
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		RandomWriterSSE2_bypass
+; Purpose:	Writes 128-bit values randomly into memory, bypassing caches.
+; Params:	rdi = ptr to array of chunk pointers
+; 		rsi = # of chunks
+; 		rdx = loops
+; 		rcx = datum to write
+;------------------------------------------------------------------------------
+RandomWriterSSE2_bypass:
+_RandomWriterSSE2_bypass:
+	push	r10
+	push	r11
+
+	movq	xmm0, rcx	; Create duplicated 128-bit datum
+	movq	xmm1, rcx
+	pslldq	xmm1, 8		; pslldq shifts by bytes: 8 moves the copy into the upper qword.
+	por	xmm0, xmm1	; xmm0 = datum in both 64-bit halves.
+
+.L1:
+	xor	r11, r11	; r11 = index into the chunk-pointer array, reset for every loop.
+
+.L2:
+	mov	r10, [rdi + 8*r11]	; Note, 64-bit pointers.
+
+	movntdq	[240+r10], xmm0	; 16 non-temporal 16-byte stores covering offsets 0..240 in shuffled order.
+	movntdq	[128+r10], xmm0
+	movntdq	[208+r10], xmm0
+	movntdq	[112+r10], xmm0
+	movntdq	[64+r10], xmm0
+	movntdq	[176+r10], xmm0
+	movntdq	[144+r10], xmm0
+	movntdq	[r10], xmm0
+	movntdq	[96+r10], xmm0
+	movntdq	[16+r10], xmm0
+	movntdq	[192+r10], xmm0
+	movntdq	[160+r10], xmm0
+	movntdq	[32+r10], xmm0
+	movntdq	[48+r10], xmm0
+	movntdq	[224+r10], xmm0
+	movntdq	[80+r10], xmm0
+
+	inc	r11
+	cmp	r11, rsi
+	jb	.L2		; Continue until every chunk pointer has been visited.
+
+	dec	rdx		; rdx = remaining loops.
+	jnz	.L1
+
+	pop	r11
+	pop	r10
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		ReaderSSE2
+; Purpose:	Reads 128-bit values sequentially from an area of memory.
+; Params:	rdi = ptr to memory area
+; 		rsi = length in bytes
+; 		rdx = loops
+;------------------------------------------------------------------------------
+ReaderSSE2:
+_ReaderSSE2:
+	push	r10
+
+	add	rsi, rdi	; rsi now points to end.
+
+.L1:
+	mov	r10, rdi	; Each loop restarts at the beginning of the area.
+
+.L2:
+	movdqa	xmm0, [r10]	; Read aligned to 16-byte boundary.
+	movdqa	xmm0, [16+r10]
+	movdqa	xmm0, [32+r10]
+	movdqa	xmm0, [48+r10]
+	movdqa	xmm0, [64+r10]
+	movdqa	xmm0, [80+r10]
+	movdqa	xmm0, [96+r10]
+	movdqa	xmm0, [112+r10]
+
+	movdqa	xmm0, [128+r10]	; Second half of the 256-byte pass.
+	movdqa	xmm0, [144+r10]
+	movdqa	xmm0, [160+r10]
+	movdqa	xmm0, [176+r10]
+	movdqa	xmm0, [192+r10]
+	movdqa	xmm0, [208+r10]
+	movdqa	xmm0, [224+r10]
+	movdqa	xmm0, [240+r10]
+
+	add	r10, 256
+	cmp	r10, rsi
+	jb	.L2		; A full 256 bytes is read per pass, so a length that is not a multiple of 256 reads past the end.
+
+	dec	rdx		; rdx = remaining loops.
+	jnz	.L1
+	
+	pop	r10
+	ret
+
+
+;------------------------------------------------------------------------------
+; Name:		Writer
+; Purpose:	Writes 64-bit value sequentially to an area of memory.
+; Params:	rdi = ptr to memory area
+; 		rsi = length in bytes
+; 		rdx = loops
+; 		rcx = quad to write
+;------------------------------------------------------------------------------
+Writer:
+_Writer:
+	push	r10
+
+	add	rsi, rdi	; rsi now points to end.
+
+.L1:
+	mov	r10, rdi	; Each loop restarts at the beginning of the area.
+
+.L2:
+	mov	[r10], rcx	; 32 qword stores = 256 bytes per pass.
+	mov	[8+r10], rcx
+	mov	[16+r10], rcx
+	mov	[24+r10], rcx
+	mov	[32+r10], rcx
+	mov	[40+r10], rcx
+	mov	[48+r10], rcx
+	mov	[56+r10], rcx
+	mov	[64+r10], rcx
+	mov	[72+r10], rcx
+	mov	[80+r10], rcx
+	mov	[88+r10], rcx
+	mov	[96+r10], rcx
+	mov	[104+r10], rcx
+	mov	[112+r10], rcx
+	mov	[120+r10], rcx
+	mov	[128+r10], rcx
+	mov	[136+r10], rcx
+	mov	[144+r10], rcx
+	mov	[152+r10], rcx
+	mov	[160+r10], rcx
+	mov	[168+r10], rcx
+	mov	[176+r10], rcx
+	mov	[184+r10], rcx
+	mov	[192+r10], rcx
+	mov	[200+r10], rcx
+	mov	[208+r10], rcx
+	mov	[216+r10], rcx
+	mov	[224+r10], rcx
+	mov	[232+r10], rcx
+	mov	[240+r10], rcx
+	mov	[248+r10], rcx
+
+	add	r10, 256
+	cmp	r10, rsi
+	jb	.L2		; A full 256 bytes is written per pass, so a length that is not a multiple of 256 writes past the end.
+
+	dec	rdx		; rdx = remaining loops.
+	jnz	.L1
+
+	pop	r10
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		WriterSSE2
+; Purpose:	Writes 128-bit value sequentially to an area of memory.
+; Params:	rdi = ptr to memory area
+; 		rsi = length in bytes
+; 		rdx = loops
+; 		rcx = quad to write
+;------------------------------------------------------------------------------
+WriterSSE2:
+_WriterSSE2:
+	push	r10
+
+	add	rsi, rdi	; rsi now points to end.
+
+	movq	xmm0, rcx	; Only the low 64 bits of xmm0 carry the datum; movq clears the upper half.
+
+.L1:
+	mov	r10, rdi	; Each loop restarts at the beginning of the area.
+
+.L2:
+	movdqa	[r10], xmm0	; 16 aligned 16-byte stores = 256 bytes per pass.
+	movdqa	[16+r10], xmm0
+	movdqa	[32+r10], xmm0
+	movdqa	[48+r10], xmm0
+	movdqa	[64+r10], xmm0
+	movdqa	[80+r10], xmm0
+	movdqa	[96+r10], xmm0
+	movdqa	[112+r10], xmm0
+
+	movdqa	[128+r10], xmm0
+	movdqa	[144+r10], xmm0
+	movdqa	[160+r10], xmm0
+	movdqa	[176+r10], xmm0
+	movdqa	[192+r10], xmm0
+	movdqa	[208+r10], xmm0
+	movdqa	[224+r10], xmm0
+	movdqa	[240+r10], xmm0
+
+	add	r10, 256
+	cmp	r10, rsi
+	jb	.L2		; A full 256 bytes is written per pass, so a length that is not a multiple of 256 writes past the end.
+
+	dec	rdx		; rdx = remaining loops.
+	jnz	.L1
+
+	pop	r10
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		WriterSSE2_bypass
+; Purpose:	Writes 128-bit value sequentially to memory, bypassing caches.
+; Params:	rdi = ptr to memory area
+; 		rsi = length in bytes
+; 		rdx = loops
+; 		rcx = quad to write
+;------------------------------------------------------------------------------
+WriterSSE2_bypass:
+_WriterSSE2_bypass:
+	push	r10
+
+	add	rsi, rdi	; rsi now points to end.
+
+	movq	xmm0, rcx	; Only the low 64 bits of xmm0 carry the datum; movq clears the upper half.
+
+.L1:
+	mov	r10, rdi	; Each loop restarts at the beginning of the area.
+
+.L2:
+	movntdq	[r10], xmm0	; Write bypassing cache.
+	movntdq	[16+r10], xmm0
+	movntdq	[32+r10], xmm0
+	movntdq	[48+r10], xmm0
+	movntdq	[64+r10], xmm0
+	movntdq	[80+r10], xmm0
+	movntdq	[96+r10], xmm0
+	movntdq	[112+r10], xmm0
+
+	movntdq	[128+r10], xmm0	; Second half of the 256-byte pass.
+	movntdq	[144+r10], xmm0
+	movntdq	[160+r10], xmm0
+	movntdq	[176+r10], xmm0
+	movntdq	[192+r10], xmm0
+	movntdq	[208+r10], xmm0
+	movntdq	[224+r10], xmm0
+	movntdq	[240+r10], xmm0
+
+	add	r10, 256
+	cmp	r10, rsi
+	jb	.L2		; A full 256 bytes is written per pass, so a length that is not a multiple of 256 writes past the end.
+
+	dec	rdx		; rdx = remaining loops.
+	jnz	.L1
+
+	pop	r10
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		StackReader
+; Purpose:	Reads 64-bit values off the stack into registers of
+;		the main register set, effectively testing L1 cache access
+;		*and* effective-address calculation speed.
+; Params:	rdi = loops
+;------------------------------------------------------------------------------
+StackReader:
+_StackReader:
+	push	qword 7000	; [rsp+48]
+	push	qword 6000	; [rsp+40]
+	push	qword 5000	; [rsp+32]
+	push	qword 4000	; [rsp+24]
+	push	qword 3000	; [rsp+16]
+	push	qword 2000	; [rsp+8]
+	push	qword 1000	; [rsp]
+
+.L1:
+	mov	rax, [rsp]	; 32 qword loads per pass, all from the seven slots pushed above.
+	mov	rax, [rsp+16]
+	mov	rax, [rsp+24]
+	mov	rax, [rsp+32]
+	mov	rax, [rsp+40]	; Offsets stay at or below +48 so no load leaves this routine's own slots.
+	mov	rax, [rsp+8]
+	mov	rax, [rsp+48]
+	mov	rax, [rsp]
+	mov	rax, [rsp]
+	mov	rax, [rsp+16]
+	mov	rax, [rsp+24]
+	mov	rax, [rsp+32]
+	mov	rax, [rsp+40]
+	mov	rax, [rsp+8]
+	mov	rax, [rsp+48]
+	mov	rax, [rsp]
+	mov	rax, [rsp]
+	mov	rax, [rsp+16]
+	mov	rax, [rsp+24]
+	mov	rax, [rsp+32]
+	mov	rax, [rsp+40]
+	mov	rax, [rsp+8]
+	mov	rax, [rsp+48]
+	mov	rax, [rsp+8]
+	mov	rax, [rsp+8]
+	mov	rax, [rsp+16]
+	mov	rax, [rsp+24]
+	mov	rax, [rsp+32]
+	mov	rax, [rsp+40]
+	mov	rax, [rsp+8]
+	mov	rax, [rsp+48]
+	mov	rax, [rsp+8]
+
+	sub	rdi, 1		; rdi = remaining loops.
+	jnz	.L1
+
+	add	rsp, 56		; Release the seven 8-byte slots.
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		StackWriter
+; Purpose:	Writes 64-bit values into the stack from registers of
+;		the main register set, effectively testing L1 cache access
+;		*and* effective-address calculation speed.
+; Params:	rdi = loops
+;------------------------------------------------------------------------------
+StackWriter:
+_StackWriter:
+	push	qword 7000	; [rsp+48]
+	push	qword 6000	; [rsp+40]
+	push	qword 5000	; [rsp+32]
+	push	qword 4000	; [rsp+24]
+	push	qword 3000	; [rsp+16]
+	push	qword 2000	; [rsp+8]
+	push	qword 1000	; [rsp]
+
+	xor	rax, rax	; rax = 0 is the value stored by every write below.
+
+.L1:
+	mov	[rsp], rax	; 32 qword stores per pass, all into the seven slots pushed above.
+	mov	[rsp+16], rax
+	mov	[rsp+24], rax
+	mov	[rsp+32], rax
+	mov	[rsp+40], rax	; Must stay at or below +48: +56 holds the return address and
+	mov	[rsp+8], rax	; anything above that belongs to the caller.
+	mov	[rsp+48], rax
+	mov	[rsp], rax
+	mov	[rsp], rax
+	mov	[rsp+16], rax
+	mov	[rsp+24], rax
+	mov	[rsp+32], rax
+	mov	[rsp+40], rax
+	mov	[rsp+8], rax
+	mov	[rsp+48], rax
+	mov	[rsp], rax
+	mov	[rsp], rax
+	mov	[rsp+16], rax
+	mov	[rsp+24], rax
+	mov	[rsp+32], rax
+	mov	[rsp+40], rax
+	mov	[rsp+8], rax
+	mov	[rsp+48], rax
+	mov	[rsp+8], rax
+	mov	[rsp+8], rax
+	mov	[rsp+16], rax
+	mov	[rsp+24], rax
+	mov	[rsp+32], rax
+	mov	[rsp+40], rax
+	mov	[rsp+8], rax
+	mov	[rsp+48], rax
+	mov	[rsp+8], rax
+
+	sub	rdi, 1		; rdi = remaining loops.
+	jnz	.L1
+
+	add	rsp, 56		; Release the seven 8-byte slots.
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		RegisterToRegister
+; Purpose:	Reads/writes 64-bit values between registers of 
+;		the main register set.
+; Params:	rdi = loops
+;------------------------------------------------------------------------------
+RegisterToRegister:
+_RegisterToRegister:
+.L1:
+	mov	rax, rbx	; 32 register-to-register moves per pass; only rax is overwritten.
+	mov	rax, rcx
+	mov	rax, rdx
+	mov	rax, rsi
+	mov	rax, rdi
+	mov	rax, rbp
+	mov	rax, rsp
+	mov	rax, rbx
+	mov	rax, rbx
+	mov	rax, rcx
+	mov	rax, rdx
+	mov	rax, rsi
+	mov	rax, rdi
+	mov	rax, rbp
+	mov	rax, rsp
+	mov	rax, rbx
+	mov	rax, rbx
+	mov	rax, rcx
+	mov	rax, rdx
+	mov	rax, rsi
+	mov	rax, rdi
+	mov	rax, rbp
+	mov	rax, rsp
+	mov	rax, rbx
+	mov	rax, rbx
+	mov	rax, rcx
+	mov	rax, rdx
+	mov	rax, rsi
+	mov	rax, rdi
+	mov	rax, rbp
+	mov	rax, rsp
+	mov	rax, rbx
+
+	sub	rdi, 1		; rdi = remaining loops.
+	jnz	.L1
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		VectorToVector
+; Purpose:	Reads/writes 128-bit values between registers of 
+;		the vector register set, in this case XMM.
+;		(I don't have access to anything with YMM.)
+; Params:	rdi = loops
+;------------------------------------------------------------------------------
+VectorToVector:
+_VectorToVector:
+.L1:
+	movdqa	xmm0, xmm1	; Each movdqa moves 16 bytes, so we need 16
+	movdqa	xmm0, xmm2	; moves to transfer a 256 byte chunk.
+	movdqa	xmm0, xmm3
+	movdqa	xmm2, xmm0
+	movdqa	xmm1, xmm2
+	movdqa	xmm2, xmm1
+	movdqa	xmm0, xmm3
+	movdqa	xmm3, xmm1
+
+	movdqa	xmm3, xmm2	; Second group of eight moves; xmm0-xmm3 are overwritten and not restored.
+	movdqa	xmm1, xmm3
+	movdqa	xmm2, xmm1
+	movdqa	xmm0, xmm1
+	movdqa	xmm1, xmm2
+	movdqa	xmm0, xmm1
+	movdqa	xmm0, xmm3
+	movdqa	xmm3, xmm0
+
+	sub	rdi, 1		; rdi = remaining loops.
+	jnz	.L1
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		RegisterToVector
+; Purpose:	Writes 64-bit main register values into 128-bit vector register
+;		clearing the upper unused bits.
+; Params:	rdi = loops
+;------------------------------------------------------------------------------
+RegisterToVector:
+_RegisterToVector:
+.L1:
+	movq	xmm1, rax 	; Each movq transfers 8 bytes, so we need
+	movq	xmm2, rsi	; 32 transfers to move a 256-byte chunk.
+	movq	xmm3, rbx
+	movq	xmm1, rcx
+	movq	xmm2, rsi
+	movq	xmm3, rsp
+	movq	xmm0, rdi
+	movq	xmm0, rdx
+
+	movq	xmm0, rax 	
+	movq	xmm1, rsi
+	movq	xmm2, rbx
+	movq	xmm3, rcx
+	movq	xmm0, rsi
+	movq	xmm3, rsp
+	movq	xmm2, rdi
+	movq	xmm1, rdx
+
+	movq	xmm0, rax 	
+	movq	xmm1, rsi
+	movq	xmm2, rbx
+	movq	xmm3, rcx
+	movq	xmm0, rsi
+	movq	xmm3, rsp
+	movq	xmm2, rdi
+	movq	xmm1, rdx
+
+	movq	xmm0, rax 	
+	movq	xmm1, rsi
+	movq	xmm2, rbx
+	movq	xmm3, rcx
+	movq	xmm0, rsi
+	movq	xmm3, rsp
+	movq	xmm2, rdi
+	movq	xmm1, rdx
+
+	dec	rdi		; rdi = remaining loops; the general registers are only read, xmm0-xmm3 are overwritten.
+	jnz .L1
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		VectorToRegister
+; Purpose:	Writes lower 64 bits of vector register into 64-bit main 
+;		register.
+; Params:	rdi = loops
+;------------------------------------------------------------------------------
+VectorToRegister:
+_VectorToRegister:
+.L1:
+	movq	rax, xmm1	; 32 movq transfers of 8 bytes each = 256 bytes per pass; only rax is overwritten.
+	movq	rax, xmm2
+	movq	rax, xmm3
+	movq	rax, xmm1
+	movq	rax, xmm2
+	movq	rax, xmm3
+	movq	rax, xmm0
+	movq	rax, xmm0
+
+	movq	rax, xmm0
+	movq	rax, xmm1
+	movq	rax, xmm2
+	movq	rax, xmm3
+	movq	rax, xmm0
+	movq	rax, xmm3
+	movq	rax, xmm2
+	movq	rax, xmm1
+
+	movq	rax, xmm0
+	movq	rax, xmm1
+	movq	rax, xmm2
+	movq	rax, xmm3
+	movq	rax, xmm0
+	movq	rax, xmm3
+	movq	rax, xmm2
+	movq	rax, xmm1
+
+	movq	rax, xmm0
+	movq	rax, xmm1
+	movq	rax, xmm2
+	movq	rax, xmm3
+	movq	rax, xmm0
+	movq	rax, xmm3
+	movq	rax, xmm2
+	movq	rax, xmm1
+
+	dec	rdi		; rdi = remaining loops.
+	jnz .L1
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Register8ToVector
+; Purpose:	Writes 8-bit main register values into 128-bit vector register
+;		without clearing the unused bits.
+; Params:	rdi = loops
+;------------------------------------------------------------------------------
+Register8ToVector:
+_Register8ToVector:
+	sal	rdi, 2  	; Force some repetition.
+.L1:
+	pinsrb	xmm1, al, 0	; 64 one-byte inserts per pass; with the count x4 above that is 256 bytes per requested loop.
+	pinsrb	xmm2, bl, 1
+	pinsrb	xmm3, cl, 2
+	pinsrb	xmm1, dl, 3
+	pinsrb	xmm2, sil, 4
+	pinsrb	xmm3, dil, 5
+	pinsrb	xmm0, bpl, 6
+	pinsrb	xmm0, spl, 7
+
+	pinsrb	xmm0, al, 0
+	pinsrb	xmm1, bl, 1
+	pinsrb	xmm2, cl, 2
+	pinsrb	xmm3, dl, 3
+	pinsrb	xmm3, al, 4
+	pinsrb	xmm2, bl, 5
+	pinsrb	xmm1, bpl, 6
+	pinsrb	xmm0, spl, 7
+
+	pinsrb	xmm1, r8b, 0
+	pinsrb	xmm2, r9b, 1
+	pinsrb	xmm3, r10b, 2
+	pinsrb	xmm1, r11b, 3
+	pinsrb	xmm2, r12b, 4
+	pinsrb	xmm3, al, 5
+	pinsrb	xmm0, cl, 6
+	pinsrb	xmm0, bl, 7
+
+	pinsrb	xmm0, r8b, 0
+	pinsrb	xmm0, r9b, 1
+	pinsrb	xmm0, r10b, 2
+	pinsrb	xmm0, r11b, 3
+	pinsrb	xmm0, r12b, 4
+	pinsrb	xmm0, al, 5
+	pinsrb	xmm0, cl, 6
+	pinsrb	xmm0, bl, 7
+
+	pinsrb	xmm1, al, 0
+	pinsrb	xmm2, bl, 1
+	pinsrb	xmm3, cl, 2
+	pinsrb	xmm1, dl, 3
+	pinsrb	xmm2, sil, 4
+	pinsrb	xmm3, dil, 5
+	pinsrb	xmm0, bpl, 6
+	pinsrb	xmm0, spl, 7
+
+	pinsrb	xmm0, al, 10
+	pinsrb	xmm1, bl, 11
+	pinsrb	xmm2, cl, 12
+	pinsrb	xmm3, dl, 13
+	pinsrb	xmm3, dil, 14
+	pinsrb	xmm2, cl, 15
+	pinsrb	xmm1, al, 6
+	pinsrb	xmm0, bpl, 7
+
+	pinsrb	xmm1, r8b, 10
+	pinsrb	xmm2, r9b, 11
+	pinsrb	xmm3, r10b, 12
+	pinsrb	xmm1, r11b, 13
+	pinsrb	xmm2, r12b, 14
+	pinsrb	xmm3, al, 15
+	pinsrb	xmm0, cl, 6
+	pinsrb	xmm0, bl, 7
+
+	pinsrb	xmm0, r8b, 9
+	pinsrb	xmm0, r9b, 8
+	pinsrb	xmm0, r10b, 11
+	pinsrb	xmm0, r11b, 3
+	pinsrb	xmm0, r12b, 4
+	pinsrb	xmm0, al, 5
+	pinsrb	xmm0, cl, 6
+	pinsrb	xmm0, bl, 7
+
+	dec	rdi		; rdi = remaining passes (4 x the requested loops).
+	jnz .L1
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Register16ToVector
+; Purpose:	Writes 16-bit main register values into 128-bit vector register
+;		without clearing the unused bits.
+; Params:	rdi = loops
+;------------------------------------------------------------------------------
+Register16ToVector:
+_Register16ToVector:
+	sal	rdi, 1  	; Force some repetition.
+.L1:
+	pinsrw	xmm1, ax, 0	; 64 two-byte inserts per pass; with the count x2 above that is 256 bytes per requested loop.
+	pinsrw	xmm2, bx, 1
+	pinsrw	xmm3, cx, 2
+	pinsrw	xmm1, dx, 3
+	pinsrw	xmm2, si, 4
+	pinsrw	xmm3, di, 5
+	pinsrw	xmm0, bp, 6
+	pinsrw	xmm0, sp, 7
+
+	pinsrw	xmm0, ax, 0
+	pinsrw	xmm1, bx, 1
+	pinsrw	xmm2, cx, 2
+	pinsrw	xmm3, dx, 3
+	pinsrw	xmm3, si, 4
+	pinsrw	xmm2, di, 5
+	pinsrw	xmm1, bp, 6
+	pinsrw	xmm0, sp, 7
+
+	pinsrw	xmm1, r8w, 0
+	pinsrw	xmm2, r9w, 1
+	pinsrw	xmm3, r10w, 2
+	pinsrw	xmm1, r11w, 3
+	pinsrw	xmm2, r12w, 4
+	pinsrw	xmm3, ax, 5
+	pinsrw	xmm0, bp, 6
+	pinsrw	xmm0, bx, 7
+
+	pinsrw	xmm0, r8w, 0
+	pinsrw	xmm0, r9w, 1
+	pinsrw	xmm0, r10w, 2
+	pinsrw	xmm0, r11w, 3
+	pinsrw	xmm0, r12w, 4
+	pinsrw	xmm0, ax, 5
+	pinsrw	xmm0, bp, 6
+	pinsrw	xmm0, bx, 7
+
+	pinsrw	xmm1, ax, 0
+	pinsrw	xmm2, bx, 1
+	pinsrw	xmm3, cx, 2
+	pinsrw	xmm1, dx, 3
+	pinsrw	xmm2, si, 4
+	pinsrw	xmm3, di, 5
+	pinsrw	xmm0, bp, 6
+	pinsrw	xmm0, sp, 7
+
+	pinsrw	xmm0, ax, 0
+	pinsrw	xmm1, bx, 1
+	pinsrw	xmm2, cx, 2
+	pinsrw	xmm3, dx, 3
+	pinsrw	xmm3, si, 4
+	pinsrw	xmm2, di, 5
+	pinsrw	xmm1, bp, 6
+	pinsrw	xmm0, sp, 7
+
+	pinsrw	xmm1, r8w, 0
+	pinsrw	xmm2, r9w, 1
+	pinsrw	xmm3, r10w, 2
+	pinsrw	xmm1, r11w, 3
+	pinsrw	xmm2, r12w, 4
+	pinsrw	xmm3, ax, 5
+	pinsrw	xmm0, bp, 6
+	pinsrw	xmm0, bx, 7
+
+	pinsrw	xmm0, r8w, 0
+	pinsrw	xmm0, r9w, 1
+	pinsrw	xmm0, r10w, 2
+	pinsrw	xmm0, r11w, 3
+	pinsrw	xmm0, r12w, 4
+	pinsrw	xmm0, ax, 5
+	pinsrw	xmm0, bp, 6
+	pinsrw	xmm0, bx, 7
+
+	dec	rdi		; rdi = remaining passes (2 x the requested loops).
+	jnz .L1
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Register32ToVector
+; Purpose:	Writes 32-bit main register values into 128-bit vector register
+;		without clearing the unused bits.
+; Params:	rdi = loops
+;------------------------------------------------------------------------------
+Register32ToVector:
+_Register32ToVector:
+.L1:
+	pinsrd	xmm1, eax, 0	; Each xfer moves 4 bytes so to move 256 bytes
+	pinsrd	xmm2, ebx, 1	; we need 64 transfers.
+	pinsrd	xmm3, ecx, 2
+	pinsrd	xmm1, edx, 3
+	pinsrd	xmm2, esi, 0
+	pinsrd	xmm3, edi, 1
+	pinsrd	xmm0, ebp, 2
+	pinsrd	xmm0, esp, 3
+
+	pinsrd	xmm0, eax, 0
+	pinsrd	xmm1, ebx, 1
+	pinsrd	xmm2, ecx, 2
+	pinsrd	xmm3, edx, 3
+	pinsrd	xmm3, esi, 3
+	pinsrd	xmm2, edi, 2
+	pinsrd	xmm1, ebp, 1
+	pinsrd	xmm0, esp, 0
+
+	pinsrd	xmm1, r8d, 0
+	pinsrd	xmm2, r9d, 1
+	pinsrd	xmm3, r10d, 2
+	pinsrd	xmm1, r11d, 3
+	pinsrd	xmm2, r12d, 0
+	pinsrd	xmm3, eax, 1
+	pinsrd	xmm0, ebp, 2
+	pinsrd	xmm0, ebx, 3
+
+	pinsrd	xmm0, r8d, 0
+	pinsrd	xmm0, r9d, 1
+	pinsrd	xmm0, r10d, 2
+	pinsrd	xmm0, r11d, 3
+	pinsrd	xmm0, r12d, 0
+	pinsrd	xmm0, eax, 0
+	pinsrd	xmm0, ebp, 0
+	pinsrd	xmm0, ebx, 0
+
+	pinsrd	xmm1, eax, 0	
+	pinsrd	xmm2, ebx, 1
+	pinsrd	xmm3, ecx, 2
+	pinsrd	xmm1, edx, 3
+	pinsrd	xmm2, esi, 0
+	pinsrd	xmm3, edi, 1
+	pinsrd	xmm0, ebp, 2
+	pinsrd	xmm0, esp, 3
+
+	pinsrd	xmm0, eax, 0
+	pinsrd	xmm1, ebx, 1
+	pinsrd	xmm2, ecx, 2
+	pinsrd	xmm3, edx, 3
+	pinsrd	xmm3, esi, 3
+	pinsrd	xmm2, edi, 2
+	pinsrd	xmm1, ebp, 1
+	pinsrd	xmm0, esp, 0
+
+	pinsrd	xmm1, r8d, 0
+	pinsrd	xmm2, r9d, 1
+	pinsrd	xmm3, r10d, 2
+	pinsrd	xmm1, r11d, 3
+	pinsrd	xmm2, r12d, 0
+	pinsrd	xmm3, eax, 1
+	pinsrd	xmm0, ebp, 2
+	pinsrd	xmm0, ebx, 3
+
+	pinsrd	xmm0, r8d, 0
+	pinsrd	xmm0, r9d, 1
+	pinsrd	xmm0, r10d, 2
+	pinsrd	xmm0, r11d, 3
+	pinsrd	xmm0, r12d, 0
+	pinsrd	xmm0, eax, 0
+	pinsrd	xmm0, ebp, 0
+	pinsrd	xmm0, ebx, 0
+
+	dec	rdi		; rdi = remaining loops; the loop count is not scaled here because one pass already moves 256 bytes.
+	jnz .L1
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Register64ToVector
+; Purpose:	Writes 64-bit main register values into 128-bit vector register
+;		without clearing the unused bits.
+; Params:	rdi = loops
+;------------------------------------------------------------------------------
+Register64ToVector:
+_Register64ToVector:
+	add	rdi, rdi	; Double the loop count: one pass holds only 16 of the 32 transfers.
+.L1:
+	pinsrq	xmm1, r8, 0	; Each xfer moves 8 bytes, therefore to do
+	pinsrq	xmm2, r9, 1	; 256 bytes we need 32 transfers.
+	pinsrq	xmm3, r10, 0
+	pinsrq	xmm1, r11, 1
+	pinsrq	xmm2, r12, 0
+	pinsrq	xmm3, rax, 1
+	pinsrq	xmm0, rbp, 0
+	pinsrq	xmm0, rbx, 1
+
+	pinsrq	xmm0, r8, 0
+	pinsrq	xmm0, r9, 1
+	pinsrq	xmm0, r10, 1
+	pinsrq	xmm0, r11, 1
+	pinsrq	xmm0, r12, 0
+	pinsrq	xmm0, rax, 0
+	pinsrq	xmm0, rbp, 0
+	pinsrq	xmm0, rbx, 0
+
+	dec	rdi		; rdi = remaining passes (2 x the requested loops).
+	jnz .L1
+	ret
+
+
+;------------------------------------------------------------------------------
+; Name:		Vector8ToRegister
+; Purpose:	Writes 8-bit vector register values into main register.
+; Params:	rdi = loops
+;------------------------------------------------------------------------------
+Vector8ToRegister:
+_Vector8ToRegister:
+	sal	rdi, 3  	; Force some repetition.
+.L1:
+	pextrb	rax, xmm1, 0	; 32 one-byte extracts per pass; with the count x8 above that is 256 bytes per requested loop.
+	pextrb	rax, xmm2, 1
+	pextrb	rax, xmm3, 2
+	pextrb	rax, xmm1, 3
+	pextrb	rax, xmm2, 4
+	pextrb	rax, xmm3, 5
+	pextrb	rax, xmm0, 6
+	pextrb	rax, xmm0, 7
+
+	pextrb	rax, xmm0, 0
+	pextrb	rax, xmm1, 1
+	pextrb	rax, xmm2, 2
+	pextrb	rax, xmm3, 3
+	pextrb	rax, xmm3, 4
+	pextrb	rax, xmm2, 5
+	pextrb	rax, xmm1, 6
+	pextrb	rax, xmm0, 7
+
+	pextrb	rax, xmm1, 0
+	pextrb	rax, xmm2, 1
+	pextrb	rax, xmm3, 2
+	pextrb	rax, xmm1, 3
+	pextrb	rax, xmm2, 4
+	pextrb	rax, xmm3, 5
+	pextrb	rax, xmm0, 6
+	pextrb	rax, xmm0, 7
+
+	pextrb	rax, xmm0, 0
+	pextrb	rax, xmm0, 1
+	pextrb	rax, xmm0, 2
+	pextrb	rax, xmm0, 3
+	pextrb	rax, xmm0, 4
+	pextrb	rax, xmm0, 5
+	pextrb	rax, xmm0, 6
+	pextrb	rax, xmm0, 7
+
+	dec	rdi		; rdi = remaining passes (8 x the requested loops).
+	jnz .L1
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Vector16ToRegister
+; Purpose:	Writes 16-bit vector register values into main register.
+; Params:	rdi = loops
+;------------------------------------------------------------------------------
+Vector16ToRegister:
+_Vector16ToRegister:
+	sal	rdi, 2  	; Force some repetition.
+.L1:
+	pextrw	rax, xmm1, 0	; 256 byte chunk / 2 bytes/xfer = 128 xfers.
+	pextrw	rax, xmm2, 1	; One pass holds 32 of them; the count x4 above supplies the rest.
+	pextrw	rax, xmm3, 2
+	pextrw	rax, xmm1, 3
+	pextrw	rax, xmm2, 4
+	pextrw	rax, xmm3, 5
+	pextrw	rax, xmm0, 6
+	pextrw	rax, xmm0, 7
+
+	pextrw	rax, xmm0, 0
+	pextrw	rax, xmm1, 1
+	pextrw	rax, xmm2, 2
+	pextrw	rax, xmm3, 3
+	pextrw	rax, xmm3, 4
+	pextrw	rax, xmm2, 5
+	pextrw	rax, xmm1, 6
+	pextrw	rax, xmm0, 7
+
+	pextrw	rax, xmm1, 0
+	pextrw	rax, xmm2, 1
+	pextrw	rax, xmm3, 2
+	pextrw	rax, xmm1, 3
+	pextrw	rax, xmm2, 4
+	pextrw	rax, xmm3, 5
+	pextrw	rax, xmm0, 6
+	pextrw	rax, xmm0, 7
+
+	pextrw	rax, xmm0, 0
+	pextrw	rax, xmm0, 1
+	pextrw	rax, xmm0, 2
+	pextrw	rax, xmm0, 3
+	pextrw	rax, xmm0, 4
+	pextrw	rax, xmm0, 5
+	pextrw	rax, xmm0, 6
+	pextrw	rax, xmm0, 7
+
+	dec	rdi		; rdi = remaining passes (4 x the requested loops).
+	jnz .L1
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Vector32ToRegister
+; Purpose:	Writes 32-bit vector register values into main register.
+; Params:	rdi = loops
+;------------------------------------------------------------------------------
+Vector32ToRegister:
+_Vector32ToRegister:
+	add 	rdi, rdi	; Double the loop count: one pass holds only 32 of the 64 transfers.
+.L1:
+	pextrd	eax, xmm1, 0	; 256 byte chunk / 4 bytes/xfer = 64 xfers.
+	pextrd	eax, xmm2, 1
+	pextrd	eax, xmm3, 2
+	pextrd	eax, xmm1, 3
+	pextrd	eax, xmm2, 0
+	pextrd	eax, xmm3, 1
+	pextrd	eax, xmm0, 2
+	pextrd	eax, xmm0, 3
+
+	pextrd	eax, xmm0, 0
+	pextrd	eax, xmm1, 1
+	pextrd	eax, xmm2, 2
+	pextrd	eax, xmm3, 3
+	pextrd	eax, xmm3, 3
+	pextrd	eax, xmm2, 2
+	pextrd	eax, xmm1, 1
+	pextrd	eax, xmm0, 0
+
+	pextrd	eax, xmm1, 0
+	pextrd	eax, xmm2, 1
+	pextrd	eax, xmm3, 2
+	pextrd	eax, xmm1, 3
+	pextrd	eax, xmm2, 0
+	pextrd	eax, xmm3, 1
+	pextrd	eax, xmm0, 2
+	pextrd	eax, xmm0, 3
+
+	pextrd	eax, xmm0, 0
+	pextrd	eax, xmm0, 1
+	pextrd	eax, xmm0, 2
+	pextrd	eax, xmm0, 3
+	pextrd	eax, xmm0, 0
+	pextrd	eax, xmm0, 0
+	pextrd	eax, xmm0, 0
+	pextrd	eax, xmm0, 0
+
+	dec	rdi		; rdi = remaining passes (2 x the requested loops).
+	jnz .L1
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		Vector64ToRegister
+; Purpose:	Writes 64-bit vector register values into main register.
+; Params:	rdi = loops
+;------------------------------------------------------------------------------
+Vector64ToRegister:
+_Vector64ToRegister:
+	add	rdi, rdi	; Double the loop count: one pass holds only 16 of the 32 transfers.
+.L1:
+	pextrq	rax, xmm1, 0	; 256 byte chunk / 8 bytes/xfer = 32 xfers.
+	pextrq	rax, xmm2, 1
+	pextrq	rax, xmm3, 0
+	pextrq	rax, xmm1, 1
+	pextrq	rax, xmm2, 0
+	pextrq	rax, xmm3, 1
+	pextrq	rax, xmm0, 0
+	pextrq	rax, xmm0, 1
+
+	pextrq	rax, xmm0, 0
+	pextrq	rax, xmm0, 1
+	pextrq	rax, xmm0, 1
+	pextrq	rax, xmm0, 1
+	pextrq	rax, xmm0, 0
+	pextrq	rax, xmm0, 0
+	pextrq	rax, xmm0, 0
+	pextrq	rax, xmm0, 0
+
+	dec	rdi		; rdi = remaining passes (2 x the requested loops).
+	jnz .L1
+	ret
+
+;------------------------------------------------------------------------------
+; Name:		CopySSE
+; Purpose:	Copies memory chunks that are 16-byte aligned.
+; Params:	rdi = ptr to destination memory area
+;		rsi = ptr to source memory area
+; 		rdx = length in bytes
+; 		rcx = loops
+;------------------------------------------------------------------------------
+CopySSE:
+_CopySSE:
+	push	r10
+
+	shr	rdx, 8	; Ensure length is multiple of 256.
+	shl	rdx, 8
+	test	rdx, rdx	; Fewer than 256 bytes leaves nothing to copy; without
+	jz	.L3		; this check the inner byte counter would wrap around.
+
+	; Save our non-parameter XMM registers.
+	sub	rsp, 192
+	movdqu	[rsp], xmm4
+	movdqu	[16+rsp], xmm5
+	movdqu	[32+rsp], xmm6
+	movdqu	[48+rsp], xmm7
+	movdqu	[64+rsp], xmm8
+	movdqu	[80+rsp], xmm9
+	movdqu	[96+rsp], xmm10
+	movdqu	[112+rsp], xmm11
+	movdqu	[128+rsp], xmm12
+	movdqu	[144+rsp], xmm13
+	movdqu	[160+rsp], xmm14
+	movdqu	[176+rsp], xmm15
+
+.L1:
+	mov	r10, rdx	; r10 = bytes still to copy in this pass.
+
+.L2:
+	; prefetchnta	[rsi]
+	movdqa	xmm0, [rsi]	; Aligned loads: 16 x 16 bytes = one 256-byte chunk.
+	movdqa	xmm1, [16+rsi]
+	movdqa	xmm2, [32+rsi]
+	movdqa	xmm3, [48+rsi]
+	movdqa	xmm4, [64+rsi]
+	movdqa	xmm5, [80+rsi]
+	movdqa	xmm6, [96+rsi]
+	movdqa	xmm7, [112+rsi]
+	movdqa	xmm8, [128+rsi]
+	movdqa	xmm9, [144+rsi]
+	movdqa	xmm10, [160+rsi]
+	movdqa	xmm11, [176+rsi]
+	movdqa	xmm12, [192+rsi]
+	movdqa	xmm13, [208+rsi]
+	movdqa	xmm14, [224+rsi]
+	movdqa	xmm15, [240+rsi]
+
+	movntdq	[rdi], xmm0	; Non-temporal stores of the same chunk.
+	movntdq	[16+rdi], xmm1
+	movntdq	[32+rdi], xmm2
+	movntdq	[48+rdi], xmm3
+	movntdq	[64+rdi], xmm4
+	movntdq	[80+rdi], xmm5
+	movntdq	[96+rdi], xmm6
+	movntdq	[112+rdi], xmm7
+	movntdq	[128+rdi], xmm8
+	movntdq	[144+rdi], xmm9
+	movntdq	[160+rdi], xmm10
+	movntdq	[176+rdi], xmm11
+	movntdq	[192+rdi], xmm12
+	movntdq	[208+rdi], xmm13
+	movntdq	[224+rdi], xmm14
+	movntdq	[240+rdi], xmm15
+
+	add	rsi, 256
+	add	rdi, 256
+
+	sub	r10, 256
+	jnz	.L2
+
+	sub	rsi, rdx	; rsi now points to start.
+	sub	rdi, rdx	; rdi now points to start.
+
+	dec	rcx		; rcx = remaining loops.
+	jnz	.L1
+
+	; Restore xmm4-xmm15 from the same slots they were saved to above.
+	movdqu	xmm4, [rsp]
+	movdqu	xmm5, [16+rsp]
+	movdqu	xmm6, [32+rsp]
+	movdqu	xmm7, [48+rsp]
+	movdqu	xmm8, [64+rsp]
+	movdqu	xmm9, [80+rsp]
+	movdqu	xmm10, [96+rsp]
+	movdqu	xmm11, [112+rsp]
+	movdqu	xmm12, [128+rsp]
+	movdqu	xmm13, [144+rsp]
+	movdqu	xmm14, [160+rsp]
+	movdqu	xmm15, [176+rsp]
+	add	rsp, 192
+
+.L3:
+	pop	r10
+
+	ret
+
diff --git a/routinesARM.S b/routinesARM.S
new file mode 100644
index 0000000..550c68e
--- /dev/null
+++ b/routinesARM.S
@@ -0,0 +1,644 @@
+
+# ============================================================================
+#  bandwidth 0.23, a benchmark to estimate memory transfer bandwidth.
+#  ARM assembly module.
+#  Copyright (C) 2010 by Zack T Smith.
+#
+#  This program is free software; you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation; either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with this program; if not, write to the Free Software
+#  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#
+#  The author may be reached at fbui@comcast.net.
+# =============================================================================
+
+#--------------
+# Version 0.7
+#--------------
+
+#include "config.h"
+
+#ifdef CONFIG_CPU_ARM1136JS
+.arch armv6k
+.fpu softvfp
+#elif CONFIG_CPU_CORTEXA9_HF
+.arch armv7-a
+.fpu neon
+#elif CONFIG_CPU_CORTEXA9
+.arch armv7-a
+.fpu softvfp
+#endif
+
+#ifdef __thumb2__
+.syntax unified
+.code 16
+#endif
+
+.section code
+
+.text
+.align 2
+
+.global Writer
+.global RandomWriter
+
+.global Reader
+.global RandomReader
+
+.global	RegisterToRegister
+.global	StackReader
+.global	StackWriter
+
+#-----------------------------------------------------------------------------
+# Name: 	Writer
+# Purpose:	Performs sequential write into memory, as fast as possible.
+# Params:
+#	r0 = address
+#	r1 = length, multiple of 256
+#	r2 = count
+# 	r3 = value to write
+# Notes:
+#	The length is rounded down to a multiple of 256 before use.
+#	Returns at once when the rounded length or the count is zero.
+#-----------------------------------------------------------------------------
+Writer:
+	stmfd	sp!,{r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
+
+# r4 = start address, copied back into r0 at the top of every pass
+# r5 = rounded length, copied back into r1 at the top of every pass
+
+# The inner loop consumes 256 bytes per iteration and exits only when the
+# remaining length is exactly zero. Clearing just the low 7 bits would let
+# a length such as 384 through; that never reaches zero and overruns the
+# buffer. Clear the low 8 bits instead.
+	bic	r1, r1, #0xff
+
+# Nothing to do for an empty buffer or a zero pass count. Without this check
+# the post-decrement tests below would wrap around instead of stopping.
+	cmp	r1, #0
+	beq	.LWriterDone
+	cmp	r2, #0
+	beq	.LWriterDone
+
+	mov	r4, r0
+	mov	r5, r1
+
+# Replicate the value into r6-r12 so that, together with r3, every stmia
+# below stores eight registers (32 bytes) of the same value.
+	mov	r6, r3
+	mov	r7, r3
+	mov	r8, r3
+	mov	r9, r3
+	mov	r10, r3
+	mov	r11, r3
+	mov	r12, r3
+
+.L0:
+# Start of one pass: rewind to the beginning of the buffer.
+	mov	r0, r4
+	mov	r1, r5
+
+.L1:
+# Does 64 transfers, 4 bytes each = 256 bytes total.
+# The "stmia" instruction automatically increments r0.
+	stmia	r0!, { r3, r6, r7, r8, r9, r10, r11, r12 }
+	stmia	r0!, { r3, r6, r7, r8, r9, r10, r11, r12 }
+	stmia	r0!, { r3, r6, r7, r8, r9, r10, r11, r12 }
+	stmia	r0!, { r3, r6, r7, r8, r9, r10, r11, r12 }
+	stmia	r0!, { r3, r6, r7, r8, r9, r10, r11, r12 }
+	stmia	r0!, { r3, r6, r7, r8, r9, r10, r11, r12 }
+	stmia	r0!, { r3, r6, r7, r8, r9, r10, r11, r12 }
+	stmia	r0!, { r3, r6, r7, r8, r9, r10, r11, r12 }
+
+# Remaining bytes in this pass.
+	sub	r1, #256
+	cmp	r1, #0
+	bne	.L1
+
+# Remaining passes.
+	sub	r2, #1
+	cmp	r2, #0
+	bne	.L0
+
+.LWriterDone:
+# return.
+	ldmfd	sp!,{r4, r5, r6, r7, r8, r9, r10, r11, r12, pc}
+
+#-----------------------------------------------------------------------------
+# Name: 	Reader
+# Purpose:	Performs sequential reads from memory, as fast as possible.
+# Params:
+#	r0 = address
+#	r1 = length, multiple of 256
+#	r2 = count
+# Notes:
+#	The length is rounded down to a multiple of 256 before use.
+#	Returns at once when the rounded length or the count is zero.
+#	The loaded values are discarded; nothing is returned in r0.
+#-----------------------------------------------------------------------------
+Reader:
+	stmfd	sp!,{r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
+
+# r3, r6-r12 = load destinations, overwritten by every ldmia
+# r4 = start address, copied back into r0 at the top of every pass
+# r5 = rounded length, copied back into r1 at the top of every pass
+
+# The inner loop consumes 256 bytes per iteration and exits only when the
+# remaining length is exactly zero. Clearing just the low 7 bits would let
+# a length such as 384 through; that never reaches zero and reads far past
+# the buffer. Clear the low 8 bits instead.
+	bic	r1, r1, #0xff
+
+# Nothing to do for an empty buffer or a zero pass count. Without this check
+# the post-decrement tests below would wrap around instead of stopping.
+	cmp	r1, #0
+	beq	.LReaderDone
+	cmp	r2, #0
+	beq	.LReaderDone
+
+	mov	r4, r0
+	mov	r5, r1
+
+.L2:
+# Start of one pass: rewind to the beginning of the buffer.
+	mov	r0, r4
+	mov	r1, r5
+
+.L3:
+# Does 64 transfers, 4 bytes each = 256 bytes total.
+# The "ldmia" instruction automatically increments r0.
+
+	ldmia	r0!, { r3, r6, r7, r8, r9, r10, r11, r12 }
+	ldmia	r0!, { r3, r6, r7, r8, r9, r10, r11, r12 }
+	ldmia	r0!, { r3, r6, r7, r8, r9, r10, r11, r12 }
+	ldmia	r0!, { r3, r6, r7, r8, r9, r10, r11, r12 }
+	ldmia	r0!, { r3, r6, r7, r8, r9, r10, r11, r12 }
+	ldmia	r0!, { r3, r6, r7, r8, r9, r10, r11, r12 }
+	ldmia	r0!, { r3, r6, r7, r8, r9, r10, r11, r12 }
+	ldmia	r0!, { r3, r6, r7, r8, r9, r10, r11, r12 }
+
+# Remaining bytes in this pass.
+	sub	r1, #256
+	cmp	r1, #0
+	bne	.L3
+
+# Remaining passes.
+	sub	r2, #1
+	cmp	r2, #0
+	bne	.L2
+
+.LReaderDone:
+# return.
+	ldmfd	sp!,{r4, r5, r6, r7, r8, r9, r10, r11, r12, pc}
+
+#-----------------------------------------------------------------------------
+# Name: 	RandomWriter
+# Purpose:	Performs random write into memory, as fast as possible.
+# Params:
+# 	r0 = pointer to array of chunk pointers
+# 	r1 = # of 256-byte chunks
+# 	r2 = # loops to do
+# 	r3 = value to write
+# Notes:
+#	The chunks are visited in array order. Inside a chunk, each of the 64
+#	word offsets (0 through 252) is written exactly once, in a fixed
+#	scrambled order.
+#	Both loop counters are tested only after the loop body has run, so a
+#	zero in r1 or r2 is not treated as "do nothing".
+#-----------------------------------------------------------------------------
+RandomWriter:
+	stmfd	sp!,{r4, r5, lr}
+
+# r4 = base address of the current chunk
+# r5 = index of the current chunk in the pointer array
+
+.L4:
+# Start of one pass over the pointer array.
+	mov	r5, #0
+
+.L5:
+# Get pointer to chunk in memory.
+# Array entries are 4 bytes each, hence the index shifted left by 2.
+	ldr	r4, [r0, r5, LSL #2]
+
+# Does 64 transfers, 4 bytes each = 256 bytes total.
+
+	str	r3, [r4, #160]
+	str	r3, [r4, #232]
+	str	r3, [r4, #224]
+	str	r3, [r4, #96]
+	str	r3, [r4, #164]
+	str	r3, [r4, #76]
+	str	r3, [r4, #100]
+	str	r3, [r4, #220]
+	str	r3, [r4, #248]
+	str	r3, [r4, #104]
+	str	r3, [r4, #4]
+	str	r3, [r4, #136]
+	str	r3, [r4, #112]
+	str	r3, [r4, #200]
+	str	r3, [r4, #12]
+	str	r3, [r4, #128]
+	str	r3, [r4, #148]
+	str	r3, [r4, #196]
+	str	r3, [r4, #216]
+	str	r3, [r4]
+	str	r3, [r4, #84]
+	str	r3, [r4, #140]
+	str	r3, [r4, #204]
+	str	r3, [r4, #184]
+	str	r3, [r4, #124]
+	str	r3, [r4, #48]
+	str	r3, [r4, #64]
+	str	r3, [r4, #212]
+	str	r3, [r4, #240]
+	str	r3, [r4, #236]
+	str	r3, [r4, #24]
+	str	r3, [r4, #252]
+	str	r3, [r4, #68]
+	str	r3, [r4, #20]
+	str	r3, [r4, #72]
+	str	r3, [r4, #32]
+	str	r3, [r4, #28]
+	str	r3, [r4, #52]
+	str	r3, [r4, #244]
+	str	r3, [r4, #180]
+	str	r3, [r4, #80]
+	str	r3, [r4, #60]
+	str	r3, [r4, #8]
+	str	r3, [r4, #56]
+	str	r3, [r4, #208]
+	str	r3, [r4, #228]
+	str	r3, [r4, #40]
+	str	r3, [r4, #172]
+	str	r3, [r4, #120]
+	str	r3, [r4, #176]
+	str	r3, [r4, #108]
+	str	r3, [r4, #132]
+	str	r3, [r4, #16]
+	str	r3, [r4, #44]
+	str	r3, [r4, #92]
+	str	r3, [r4, #168]
+	str	r3, [r4, #152]
+	str	r3, [r4, #156]
+	str	r3, [r4, #188]
+	str	r3, [r4, #36]
+	str	r3, [r4, #88]
+	str	r3, [r4, #116]
+	str	r3, [r4, #192]
+	str	r3, [r4, #144]
+
+# Advance to the next chunk until all r1 chunks have been written.
+	add	r5, #1
+	cmp	r5, r1
+	bne	.L5
+
+# Remaining passes.
+	sub	r2, #1
+	cmp	r2, #0
+	bne	.L4
+
+# return.
+	ldmfd	sp!,{r4, r5, pc}
+
+#-----------------------------------------------------------------------------
+# Name: 	RandomReader
+# Purpose:	Performs random reads from memory, as fast as possible.
+# Params:
+# 	r0 = pointer to array of chunk pointers
+# 	r1 = # of 256-byte chunks
+# 	r2 = # loops to do
+# Notes:
+#	The chunks are visited in array order. Inside a chunk, each of the 64
+#	word offsets (0 through 252) is read exactly once, in a fixed
+#	scrambled order. Every load targets r3, so the values are discarded.
+#	Both loop counters are tested only after the loop body has run, so a
+#	zero in r1 or r2 is not treated as "do nothing".
+#-----------------------------------------------------------------------------
+RandomReader:
+	stmfd	sp!,{r4, r5, lr}
+
+# r3 = load destination, overwritten by every ldr
+# r4 = base address of the current chunk
+# r5 = index of the current chunk in the pointer array
+
+.L6:
+# Start of one pass over the pointer array.
+	mov	r5, #0
+
+.L7:
+# Get pointer to chunk in memory.
+# Array entries are 4 bytes each, hence the index shifted left by 2.
+	ldr	r4, [r0, r5, LSL #2]
+
+# Does 64 transfers, 4 bytes each = 256 bytes total.
+
+	ldr	r3, [r4, #160]
+	ldr	r3, [r4, #232]
+	ldr	r3, [r4, #224]
+	ldr	r3, [r4, #96]
+	ldr	r3, [r4, #164]
+	ldr	r3, [r4, #76]
+	ldr	r3, [r4, #100]
+	ldr	r3, [r4, #220]
+	ldr	r3, [r4, #248]
+	ldr	r3, [r4, #104]
+	ldr	r3, [r4, #4]
+	ldr	r3, [r4, #136]
+	ldr	r3, [r4, #112]
+	ldr	r3, [r4, #200]
+	ldr	r3, [r4, #12]
+	ldr	r3, [r4, #128]
+	ldr	r3, [r4, #148]
+	ldr	r3, [r4, #196]
+	ldr	r3, [r4, #216]
+	ldr	r3, [r4]
+	ldr	r3, [r4, #84]
+	ldr	r3, [r4, #140]
+	ldr	r3, [r4, #204]
+	ldr	r3, [r4, #184]
+	ldr	r3, [r4, #124]
+	ldr	r3, [r4, #48]
+	ldr	r3, [r4, #64]
+	ldr	r3, [r4, #212]
+	ldr	r3, [r4, #240]
+	ldr	r3, [r4, #236]
+	ldr	r3, [r4, #24]
+	ldr	r3, [r4, #252]
+	ldr	r3, [r4, #68]
+	ldr	r3, [r4, #20]
+	ldr	r3, [r4, #72]
+	ldr	r3, [r4, #32]
+	ldr	r3, [r4, #28]
+	ldr	r3, [r4, #52]
+	ldr	r3, [r4, #244]
+	ldr	r3, [r4, #180]
+	ldr	r3, [r4, #80]
+	ldr	r3, [r4, #60]
+	ldr	r3, [r4, #8]
+	ldr	r3, [r4, #56]
+	ldr	r3, [r4, #208]
+	ldr	r3, [r4, #228]
+	ldr	r3, [r4, #40]
+	ldr	r3, [r4, #172]
+	ldr	r3, [r4, #120]
+	ldr	r3, [r4, #176]
+	ldr	r3, [r4, #108]
+	ldr	r3, [r4, #132]
+	ldr	r3, [r4, #16]
+	ldr	r3, [r4, #44]
+	ldr	r3, [r4, #92]
+	ldr	r3, [r4, #168]
+	ldr	r3, [r4, #152]
+	ldr	r3, [r4, #156]
+	ldr	r3, [r4, #188]
+	ldr	r3, [r4, #36]
+	ldr	r3, [r4, #88]
+	ldr	r3, [r4, #116]
+	ldr	r3, [r4, #192]
+	ldr	r3, [r4, #144]
+
+# Advance to the next chunk until all r1 chunks have been read.
+	add	r5, #1
+	cmp	r5, r1
+	bne	.L7
+
+# Remaining passes.
+	sub	r2, #1
+	cmp	r2, #0
+	bne	.L6
+
+# return.
+	ldmfd	sp!,{r4, r5, pc}
+
+#-----------------------------------------------------------------------------
+# Name: 	RegisterToRegister
+# Purpose:	Performs register-to-register transfers.
+# Params:
+#	r0 = count
+# Notes:
+#	Each pass executes 64 mov instructions in eight groups of eight.
+#	Only r1 and r2 are written (the second group targets r2, all other
+#	groups target r1); r3-r9 are only read and keep their incoming values.
+#	The count is decremented before it is tested, so a zero count is not
+#	treated as "do nothing".
+#-----------------------------------------------------------------------------
+RegisterToRegister:
+	stmfd	sp!,{lr}
+
+# r1 = scratch destination (r2 as well, in the second group)
+
+.L8:
+# Does 64 transfers, 4 bytes each = 256 bytes total.
+
+	mov	r1, r2
+	mov	r1, r3
+	mov	r1, r4
+	mov	r1, r5
+	mov	r1, r6
+	mov	r1, r7
+	mov	r1, r8
+	mov	r1, r9
+
+# This group writes r2 rather than r1.
+	mov	r2, r1
+	mov	r2, r3
+	mov	r2, r4
+	mov	r2, r5
+	mov	r2, r6
+	mov	r2, r7
+	mov	r2, r8
+	mov	r2, r9
+
+	mov	r1, r2
+	mov	r1, r3
+	mov	r1, r4
+	mov	r1, r5
+	mov	r1, r6
+	mov	r1, r7
+	mov	r1, r8
+	mov	r1, r9
+
+	mov	r1, r2
+	mov	r1, r3
+	mov	r1, r4
+	mov	r1, r5
+	mov	r1, r6
+	mov	r1, r7
+	mov	r1, r8
+	mov	r1, r9
+
+	mov	r1, r2
+	mov	r1, r3
+	mov	r1, r4
+	mov	r1, r5
+	mov	r1, r6
+	mov	r1, r7
+	mov	r1, r8
+	mov	r1, r9
+
+	mov	r1, r2
+	mov	r1, r3
+	mov	r1, r4
+	mov	r1, r5
+	mov	r1, r6
+	mov	r1, r7
+	mov	r1, r8
+	mov	r1, r9
+
+	mov	r1, r2
+	mov	r1, r3
+	mov	r1, r4
+	mov	r1, r5
+	mov	r1, r6
+	mov	r1, r7
+	mov	r1, r8
+	mov	r1, r9
+
+	mov	r1, r2
+	mov	r1, r3
+	mov	r1, r4
+	mov	r1, r5
+	mov	r1, r6
+	mov	r1, r7
+	mov	r1, r8
+	mov	r1, r9
+
+# Remaining passes.
+	sub	r0, #1
+	cmp	r0, #0
+	bne	.L8
+
+# return.
+	ldmfd	sp!,{pc}
+
+#-----------------------------------------------------------------------------
+# Name: 	StackReader
+# Purpose:	Performs stack-to-register transfers.
+# Params:
+#	r0 = count
+# Notes:
+#	Reserves a 32-byte scratch area on the stack and reads its eight words
+#	eight times per pass. This routine never stores to that area, so the
+#	values loaded are whatever the stack already held; every load targets
+#	r1 and the values are discarded.
+#	The count is decremented before it is tested, so a zero count is not
+#	treated as "do nothing".
+#-----------------------------------------------------------------------------
+StackReader:
+	stmfd	sp!,{lr}
+
+# r1 = load destination, overwritten by every ldr
+
+# Reserve the 32-byte scratch area (released again before returning).
+	sub	sp, #32
+.L9:
+# Does 64 transfers, 4 bytes each = 256 bytes total.
+
+	ldr	r1, [sp]
+	ldr	r1, [sp, #4]
+	ldr	r1, [sp, #8]
+	ldr	r1, [sp, #12]
+	ldr	r1, [sp, #16]
+	ldr	r1, [sp, #20]
+	ldr	r1, [sp, #24]
+	ldr	r1, [sp, #28]
+
+	ldr	r1, [sp]
+	ldr	r1, [sp, #4]
+	ldr	r1, [sp, #8]
+	ldr	r1, [sp, #12]
+	ldr	r1, [sp, #16]
+	ldr	r1, [sp, #20]
+	ldr	r1, [sp, #24]
+	ldr	r1, [sp, #28]
+
+	ldr	r1, [sp]
+	ldr	r1, [sp, #4]
+	ldr	r1, [sp, #8]
+	ldr	r1, [sp, #12]
+	ldr	r1, [sp, #16]
+	ldr	r1, [sp, #20]
+	ldr	r1, [sp, #24]
+	ldr	r1, [sp, #28]
+
+	ldr	r1, [sp]
+	ldr	r1, [sp, #4]
+	ldr	r1, [sp, #8]
+	ldr	r1, [sp, #12]
+	ldr	r1, [sp, #16]
+	ldr	r1, [sp, #20]
+	ldr	r1, [sp, #24]
+	ldr	r1, [sp, #28]
+
+	ldr	r1, [sp]
+	ldr	r1, [sp, #4]
+	ldr	r1, [sp, #8]
+	ldr	r1, [sp, #12]
+	ldr	r1, [sp, #16]
+	ldr	r1, [sp, #20]
+	ldr	r1, [sp, #24]
+	ldr	r1, [sp, #28]
+
+	ldr	r1, [sp]
+	ldr	r1, [sp, #4]
+	ldr	r1, [sp, #8]
+	ldr	r1, [sp, #12]
+	ldr	r1, [sp, #16]
+	ldr	r1, [sp, #20]
+	ldr	r1, [sp, #24]
+	ldr	r1, [sp, #28]
+
+	ldr	r1, [sp]
+	ldr	r1, [sp, #4]
+	ldr	r1, [sp, #8]
+	ldr	r1, [sp, #12]
+	ldr	r1, [sp, #16]
+	ldr	r1, [sp, #20]
+	ldr	r1, [sp, #24]
+	ldr	r1, [sp, #28]
+
+	ldr	r1, [sp]
+	ldr	r1, [sp, #4]
+	ldr	r1, [sp, #8]
+	ldr	r1, [sp, #12]
+	ldr	r1, [sp, #16]
+	ldr	r1, [sp, #20]
+	ldr	r1, [sp, #24]
+	ldr	r1, [sp, #28]
+
+# Remaining passes.
+	sub	r0, #1
+	cmp	r0, #0
+	bne	.L9
+
+# Release the scratch area so that sp points at the saved lr again.
+	add	sp, #32
+
+# return.
+	ldmfd	sp!,{pc}
+
+#-----------------------------------------------------------------------------
+# Name: 	StackWriter
+# Purpose:	Performs register-to-stack transfers.
+# Params:
+#	r0 = count
+# Notes:
+#	Reserves a 32-byte scratch area on the stack and overwrites its eight
+#	words eight times per pass. The value stored is r1, which this routine
+#	never sets, so it is whatever the caller left in that register.
+#	The count is decremented before it is tested, so a zero count is not
+#	treated as "do nothing".
+#-----------------------------------------------------------------------------
+StackWriter:
+	stmfd	sp!,{lr}
+
+# r1 = value stored; not initialised here
+
+# Reserve the 32-byte scratch area (released again before returning).
+	sub	sp, #32
+.L10:
+# Does 64 transfers, 4 bytes each = 256 bytes total.
+
+	str	r1, [sp]
+	str	r1, [sp, #4]
+	str	r1, [sp, #8]
+	str	r1, [sp, #12]
+	str	r1, [sp, #16]
+	str	r1, [sp, #20]
+	str	r1, [sp, #24]
+	str	r1, [sp, #28]
+
+	str	r1, [sp]
+	str	r1, [sp, #4]
+	str	r1, [sp, #8]
+	str	r1, [sp, #12]
+	str	r1, [sp, #16]
+	str	r1, [sp, #20]
+	str	r1, [sp, #24]
+	str	r1, [sp, #28]
+
+	str	r1, [sp]
+	str	r1, [sp, #4]
+	str	r1, [sp, #8]
+	str	r1, [sp, #12]
+	str	r1, [sp, #16]
+	str	r1, [sp, #20]
+	str	r1, [sp, #24]
+	str	r1, [sp, #28]
+
+	str	r1, [sp]
+	str	r1, [sp, #4]
+	str	r1, [sp, #8]
+	str	r1, [sp, #12]
+	str	r1, [sp, #16]
+	str	r1, [sp, #20]
+	str	r1, [sp, #24]
+	str	r1, [sp, #28]
+
+	str	r1, [sp]
+	str	r1, [sp, #4]
+	str	r1, [sp, #8]
+	str	r1, [sp, #12]
+	str	r1, [sp, #16]
+	str	r1, [sp, #20]
+	str	r1, [sp, #24]
+	str	r1, [sp, #28]
+
+	str	r1, [sp]
+	str	r1, [sp, #4]
+	str	r1, [sp, #8]
+	str	r1, [sp, #12]
+	str	r1, [sp, #16]
+	str	r1, [sp, #20]
+	str	r1, [sp, #24]
+	str	r1, [sp, #28]
+
+	str	r1, [sp]
+	str	r1, [sp, #4]
+	str	r1, [sp, #8]
+	str	r1, [sp, #12]
+	str	r1, [sp, #16]
+	str	r1, [sp, #20]
+	str	r1, [sp, #24]
+	str	r1, [sp, #28]
+
+	str	r1, [sp]
+	str	r1, [sp, #4]
+	str	r1, [sp, #8]
+	str	r1, [sp, #12]
+	str	r1, [sp, #16]
+	str	r1, [sp, #20]
+	str	r1, [sp, #24]
+	str	r1, [sp, #28]
+
+# Remaining passes.
+	sub	r0, #1
+	cmp	r0, #0
+	bne	.L10
+
+# Release the scratch area so that sp points at the saved lr again.
+	add	sp, #32
+
+# return.
+	ldmfd	sp!,{pc}
+
diff --git a/routinesARM.asm b/routinesARM.asm
new file mode 100644
index 0000000..27d4230
--- /dev/null
+++ b/routinesARM.asm
@@ -0,0 +1,629 @@
+
+# ============================================================================
+#  bandwidth 0.23, a benchmark to estimate memory transfer bandwidth.
+#  ARM assembly module.
+#  Copyright (C) 2010 by Zack T Smith.
+#
+#  This program is free software; you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation; either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with this program; if not, write to the Free Software
+#  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#
+#  The author may be reached at fbui@comcast.net.
+# =============================================================================
+
+#--------------
+# Version 0.7
+#--------------
+
+.arch armv5t
+.fpu softvfp
+
+.section code 
+
+.text
+.align 2
+
+.global Writer
+.global RandomWriter
+
+.global Reader
+.global RandomReader
+
+.global	RegisterToRegister
+.global	StackReader
+.global	StackWriter
+
+#-----------------------------------------------------------------------------
+# Name: 	Writer
+# Purpose:	Performs sequential write into memory, as fast as possible.
+# Params:
+#	r0 = address
+#	r1 = length, multiple of 256
+#	r2 = count
+# 	r3 = value to write
+# Notes:
+#	The length is rounded down to a multiple of 256 before use.
+#	Returns at once when the rounded length or the count is zero.
+#-----------------------------------------------------------------------------
+Writer:
+	stmfd	sp!,{r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
+
+# r4 = start address, copied back into r0 at the top of every pass
+# r5 = rounded length, copied back into r1 at the top of every pass
+
+# The inner loop consumes 256 bytes per iteration and exits only when the
+# remaining length is exactly zero. Clearing just the low 7 bits would let
+# a length such as 384 through; that never reaches zero and overruns the
+# buffer. Clear the low 8 bits instead.
+	bic	r1, r1, #0xff
+
+# Nothing to do for an empty buffer or a zero pass count. Without this check
+# the post-decrement tests below would wrap around instead of stopping.
+	cmp	r1, #0
+	beq	.LWriterDone
+	cmp	r2, #0
+	beq	.LWriterDone
+
+	mov	r4, r0
+	mov	r5, r1
+
+# Replicate the value into r6-r12 so that, together with r3, every stmia
+# below stores eight registers (32 bytes) of the same value.
+	mov	r6, r3
+	mov	r7, r3
+	mov	r8, r3
+	mov	r9, r3
+	mov	r10, r3
+	mov	r11, r3
+	mov	r12, r3
+
+.L0:
+# Start of one pass: rewind to the beginning of the buffer.
+	mov	r0, r4
+	mov	r1, r5
+
+.L1:
+# Does 64 transfers, 4 bytes each = 256 bytes total.
+# The "stmia" instruction automatically increments r0.
+	stmia	r0!, { r3, r6, r7, r8, r9, r10, r11, r12 }
+	stmia	r0!, { r3, r6, r7, r8, r9, r10, r11, r12 }
+	stmia	r0!, { r3, r6, r7, r8, r9, r10, r11, r12 }
+	stmia	r0!, { r3, r6, r7, r8, r9, r10, r11, r12 }
+	stmia	r0!, { r3, r6, r7, r8, r9, r10, r11, r12 }
+	stmia	r0!, { r3, r6, r7, r8, r9, r10, r11, r12 }
+	stmia	r0!, { r3, r6, r7, r8, r9, r10, r11, r12 }
+	stmia	r0!, { r3, r6, r7, r8, r9, r10, r11, r12 }
+
+# Remaining bytes in this pass.
+	sub	r1, #256
+	cmp	r1, #0
+	bne	.L1
+
+# Remaining passes.
+	sub	r2, #1
+	cmp	r2, #0
+	bne	.L0
+
+.LWriterDone:
+# return.
+	ldmfd	sp!,{r4, r5, r6, r7, r8, r9, r10, r11, r12, pc}
+
+#-----------------------------------------------------------------------------
+# Name: 	Reader
+# Purpose:	Performs sequential reads from memory, as fast as possible.
+# Params:
+#	r0 = address
+#	r1 = length, multiple of 256
+#	r2 = count
+# Notes:
+#	The length is rounded down to a multiple of 256 before use.
+#	Returns at once when the rounded length or the count is zero.
+#	The loaded values are discarded; nothing is returned in r0.
+#-----------------------------------------------------------------------------
+Reader:
+	stmfd	sp!,{r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
+
+# r3, r6-r12 = load destinations, overwritten by every ldmia
+# r4 = start address, copied back into r0 at the top of every pass
+# r5 = rounded length, copied back into r1 at the top of every pass
+
+# The inner loop consumes 256 bytes per iteration and exits only when the
+# remaining length is exactly zero. Clearing just the low 7 bits would let
+# a length such as 384 through; that never reaches zero and reads far past
+# the buffer. Clear the low 8 bits instead.
+	bic	r1, r1, #0xff
+
+# Nothing to do for an empty buffer or a zero pass count. Without this check
+# the post-decrement tests below would wrap around instead of stopping.
+	cmp	r1, #0
+	beq	.LReaderDone
+	cmp	r2, #0
+	beq	.LReaderDone
+
+	mov	r4, r0
+	mov	r5, r1
+
+.L2:
+# Start of one pass: rewind to the beginning of the buffer.
+	mov	r0, r4
+	mov	r1, r5
+
+.L3:
+# Does 64 transfers, 4 bytes each = 256 bytes total.
+# The "ldmia" instruction automatically increments r0.
+
+	ldmia	r0!, { r3, r6, r7, r8, r9, r10, r11, r12 }
+	ldmia	r0!, { r3, r6, r7, r8, r9, r10, r11, r12 }
+	ldmia	r0!, { r3, r6, r7, r8, r9, r10, r11, r12 }
+	ldmia	r0!, { r3, r6, r7, r8, r9, r10, r11, r12 }
+	ldmia	r0!, { r3, r6, r7, r8, r9, r10, r11, r12 }
+	ldmia	r0!, { r3, r6, r7, r8, r9, r10, r11, r12 }
+	ldmia	r0!, { r3, r6, r7, r8, r9, r10, r11, r12 }
+	ldmia	r0!, { r3, r6, r7, r8, r9, r10, r11, r12 }
+
+# Remaining bytes in this pass.
+	sub	r1, #256
+	cmp	r1, #0
+	bne	.L3
+
+# Remaining passes.
+	sub	r2, #1
+	cmp	r2, #0
+	bne	.L2
+
+.LReaderDone:
+# return.
+	ldmfd	sp!,{r4, r5, r6, r7, r8, r9, r10, r11, r12, pc}
+
+#-----------------------------------------------------------------------------
+# Name: 	RandomWriter
+# Purpose:	Performs random write into memory, as fast as possible.
+# Params:
+# 	r0 = pointer to array of chunk pointers
+# 	r1 = # of 256-byte chunks
+# 	r2 = # loops to do
+# 	r3 = value to write
+# Notes:
+#	The chunks are visited in array order. Inside a chunk, each of the 64
+#	word offsets (0 through 252) is written exactly once, in a fixed
+#	scrambled order.
+#	Both loop counters are tested only after the loop body has run, so a
+#	zero in r1 or r2 is not treated as "do nothing".
+#-----------------------------------------------------------------------------
+RandomWriter:
+	stmfd	sp!,{r4, r5, lr}
+
+# r4 = base address of the current chunk
+# r5 = index of the current chunk in the pointer array
+
+.L4:
+# Start of one pass over the pointer array.
+	mov	r5, #0
+
+.L5:
+# Get pointer to chunk in memory.
+# Array entries are 4 bytes each, hence the index shifted left by 2.
+	ldr	r4, [r0, r5, LSL #2]
+
+# Does 64 transfers, 4 bytes each = 256 bytes total.
+
+	str	r3, [r4, #160]
+	str	r3, [r4, #232]
+	str	r3, [r4, #224]
+	str	r3, [r4, #96]
+	str	r3, [r4, #164]
+	str	r3, [r4, #76]
+	str	r3, [r4, #100]
+	str	r3, [r4, #220]
+	str	r3, [r4, #248]
+	str	r3, [r4, #104]
+	str	r3, [r4, #4]
+	str	r3, [r4, #136]
+	str	r3, [r4, #112]
+	str	r3, [r4, #200]
+	str	r3, [r4, #12]
+	str	r3, [r4, #128]
+	str	r3, [r4, #148]
+	str	r3, [r4, #196]
+	str	r3, [r4, #216]
+	str	r3, [r4]
+	str	r3, [r4, #84]
+	str	r3, [r4, #140]
+	str	r3, [r4, #204]
+	str	r3, [r4, #184]
+	str	r3, [r4, #124]
+	str	r3, [r4, #48]
+	str	r3, [r4, #64]
+	str	r3, [r4, #212]
+	str	r3, [r4, #240]
+	str	r3, [r4, #236]
+	str	r3, [r4, #24]
+	str	r3, [r4, #252]
+	str	r3, [r4, #68]
+	str	r3, [r4, #20]
+	str	r3, [r4, #72]
+	str	r3, [r4, #32]
+	str	r3, [r4, #28]
+	str	r3, [r4, #52]
+	str	r3, [r4, #244]
+	str	r3, [r4, #180]
+	str	r3, [r4, #80]
+	str	r3, [r4, #60]
+	str	r3, [r4, #8]
+	str	r3, [r4, #56]
+	str	r3, [r4, #208]
+	str	r3, [r4, #228]
+	str	r3, [r4, #40]
+	str	r3, [r4, #172]
+	str	r3, [r4, #120]
+	str	r3, [r4, #176]
+	str	r3, [r4, #108]
+	str	r3, [r4, #132]
+	str	r3, [r4, #16]
+	str	r3, [r4, #44]
+	str	r3, [r4, #92]
+	str	r3, [r4, #168]
+	str	r3, [r4, #152]
+	str	r3, [r4, #156]
+	str	r3, [r4, #188]
+	str	r3, [r4, #36]
+	str	r3, [r4, #88]
+	str	r3, [r4, #116]
+	str	r3, [r4, #192]
+	str	r3, [r4, #144]
+
+# Advance to the next chunk until all r1 chunks have been written.
+	add	r5, #1
+	cmp	r5, r1
+	bne	.L5
+
+# Remaining passes.
+	sub	r2, #1
+	cmp	r2, #0
+	bne	.L4
+
+# return.
+	ldmfd	sp!,{r4, r5, pc}
+
+#-----------------------------------------------------------------------------
+# Name: 	RandomReader
+# Purpose:	Performs random reads from memory, as fast as possible.
+# Params:
+# 	r0 = pointer to array of chunk pointers
+# 	r1 = # of 256-byte chunks
+# 	r2 = # loops to do
+# Notes:
+#	The chunks are visited in array order. Inside a chunk, each of the 64
+#	word offsets (0 through 252) is read exactly once, in a fixed
+#	scrambled order. Every load targets r3, so the values are discarded.
+#	Both loop counters are tested only after the loop body has run, so a
+#	zero in r1 or r2 is not treated as "do nothing".
+#-----------------------------------------------------------------------------
+RandomReader:
+	stmfd	sp!,{r4, r5, lr}
+
+# r3 = load destination, overwritten by every ldr
+# r4 = base address of the current chunk
+# r5 = index of the current chunk in the pointer array
+
+.L6:
+# Start of one pass over the pointer array.
+	mov	r5, #0
+
+.L7:
+# Get pointer to chunk in memory.
+# Array entries are 4 bytes each, hence the index shifted left by 2.
+	ldr	r4, [r0, r5, LSL #2]
+
+# Does 64 transfers, 4 bytes each = 256 bytes total.
+
+	ldr	r3, [r4, #160]
+	ldr	r3, [r4, #232]
+	ldr	r3, [r4, #224]
+	ldr	r3, [r4, #96]
+	ldr	r3, [r4, #164]
+	ldr	r3, [r4, #76]
+	ldr	r3, [r4, #100]
+	ldr	r3, [r4, #220]
+	ldr	r3, [r4, #248]
+	ldr	r3, [r4, #104]
+	ldr	r3, [r4, #4]
+	ldr	r3, [r4, #136]
+	ldr	r3, [r4, #112]
+	ldr	r3, [r4, #200]
+	ldr	r3, [r4, #12]
+	ldr	r3, [r4, #128]
+	ldr	r3, [r4, #148]
+	ldr	r3, [r4, #196]
+	ldr	r3, [r4, #216]
+	ldr	r3, [r4]
+	ldr	r3, [r4, #84]
+	ldr	r3, [r4, #140]
+	ldr	r3, [r4, #204]
+	ldr	r3, [r4, #184]
+	ldr	r3, [r4, #124]
+	ldr	r3, [r4, #48]
+	ldr	r3, [r4, #64]
+	ldr	r3, [r4, #212]
+	ldr	r3, [r4, #240]
+	ldr	r3, [r4, #236]
+	ldr	r3, [r4, #24]
+	ldr	r3, [r4, #252]
+	ldr	r3, [r4, #68]
+	ldr	r3, [r4, #20]
+	ldr	r3, [r4, #72]
+	ldr	r3, [r4, #32]
+	ldr	r3, [r4, #28]
+	ldr	r3, [r4, #52]
+	ldr	r3, [r4, #244]
+	ldr	r3, [r4, #180]
+	ldr	r3, [r4, #80]
+	ldr	r3, [r4, #60]
+	ldr	r3, [r4, #8]
+	ldr	r3, [r4, #56]
+	ldr	r3, [r4, #208]
+	ldr	r3, [r4, #228]
+	ldr	r3, [r4, #40]
+	ldr	r3, [r4, #172]
+	ldr	r3, [r4, #120]
+	ldr	r3, [r4, #176]
+	ldr	r3, [r4, #108]
+	ldr	r3, [r4, #132]
+	ldr	r3, [r4, #16]
+	ldr	r3, [r4, #44]
+	ldr	r3, [r4, #92]
+	ldr	r3, [r4, #168]
+	ldr	r3, [r4, #152]
+	ldr	r3, [r4, #156]
+	ldr	r3, [r4, #188]
+	ldr	r3, [r4, #36]
+	ldr	r3, [r4, #88]
+	ldr	r3, [r4, #116]
+	ldr	r3, [r4, #192]
+	ldr	r3, [r4, #144]
+
+# Advance to the next chunk until all r1 chunks have been read.
+	add	r5, #1
+	cmp	r5, r1
+	bne	.L7
+
+# Remaining passes.
+	sub	r2, #1
+	cmp	r2, #0
+	bne	.L6
+
+# return.
+	ldmfd	sp!,{r4, r5, pc}
+
+#-----------------------------------------------------------------------------
+# Name: 	RegisterToRegister
+# Purpose:	Performs register-to-register transfers.
+# Params:
+#	r0 = count
+# Notes:
+#	Each pass executes 64 mov instructions in eight groups of eight.
+#	Only r1 and r2 are written (the second group targets r2, all other
+#	groups target r1); r3-r9 are only read and keep their incoming values.
+#	The count is decremented before it is tested, so a zero count is not
+#	treated as "do nothing".
+#-----------------------------------------------------------------------------
+RegisterToRegister:
+	stmfd	sp!,{lr}
+
+# r1 = scratch destination (r2 as well, in the second group)
+
+.L8:
+# Does 64 transfers, 4 bytes each = 256 bytes total.
+
+	mov	r1, r2
+	mov	r1, r3
+	mov	r1, r4
+	mov	r1, r5
+	mov	r1, r6
+	mov	r1, r7
+	mov	r1, r8
+	mov	r1, r9
+
+# This group writes r2 rather than r1.
+	mov	r2, r1
+	mov	r2, r3
+	mov	r2, r4
+	mov	r2, r5
+	mov	r2, r6
+	mov	r2, r7
+	mov	r2, r8
+	mov	r2, r9
+
+	mov	r1, r2
+	mov	r1, r3
+	mov	r1, r4
+	mov	r1, r5
+	mov	r1, r6
+	mov	r1, r7
+	mov	r1, r8
+	mov	r1, r9
+
+	mov	r1, r2
+	mov	r1, r3
+	mov	r1, r4
+	mov	r1, r5
+	mov	r1, r6
+	mov	r1, r7
+	mov	r1, r8
+	mov	r1, r9
+
+	mov	r1, r2
+	mov	r1, r3
+	mov	r1, r4
+	mov	r1, r5
+	mov	r1, r6
+	mov	r1, r7
+	mov	r1, r8
+	mov	r1, r9
+
+	mov	r1, r2
+	mov	r1, r3
+	mov	r1, r4
+	mov	r1, r5
+	mov	r1, r6
+	mov	r1, r7
+	mov	r1, r8
+	mov	r1, r9
+
+	mov	r1, r2
+	mov	r1, r3
+	mov	r1, r4
+	mov	r1, r5
+	mov	r1, r6
+	mov	r1, r7
+	mov	r1, r8
+	mov	r1, r9
+
+	mov	r1, r2
+	mov	r1, r3
+	mov	r1, r4
+	mov	r1, r5
+	mov	r1, r6
+	mov	r1, r7
+	mov	r1, r8
+	mov	r1, r9
+
+# Remaining passes.
+	sub	r0, #1
+	cmp	r0, #0
+	bne	.L8
+
+# return.
+	ldmfd	sp!,{pc}
+
+#-----------------------------------------------------------------------------
+# Name: 	StackReader
+# Purpose:	Performs stack-to-register transfers.
+# Params:
+#	r0 = count
+# Notes:
+#	Reserves a 32-byte scratch area on the stack and reads its eight words
+#	eight times per pass. This routine never stores to that area, so the
+#	values loaded are whatever the stack already held; every load targets
+#	r1 and the values are discarded.
+#	The count is decremented before it is tested, so a zero count is not
+#	treated as "do nothing".
+#-----------------------------------------------------------------------------
+StackReader:
+	stmfd	sp!,{lr}
+
+# r1 = load destination, overwritten by every ldr
+
+# Reserve the 32-byte scratch area (released again before returning).
+	sub	sp, #32
+.L9:
+# Does 64 transfers, 4 bytes each = 256 bytes total.
+
+	ldr	r1, [sp]
+	ldr	r1, [sp, #4]
+	ldr	r1, [sp, #8]
+	ldr	r1, [sp, #12]
+	ldr	r1, [sp, #16]
+	ldr	r1, [sp, #20]
+	ldr	r1, [sp, #24]
+	ldr	r1, [sp, #28]
+
+	ldr	r1, [sp]
+	ldr	r1, [sp, #4]
+	ldr	r1, [sp, #8]
+	ldr	r1, [sp, #12]
+	ldr	r1, [sp, #16]
+	ldr	r1, [sp, #20]
+	ldr	r1, [sp, #24]
+	ldr	r1, [sp, #28]
+
+	ldr	r1, [sp]
+	ldr	r1, [sp, #4]
+	ldr	r1, [sp, #8]
+	ldr	r1, [sp, #12]
+	ldr	r1, [sp, #16]
+	ldr	r1, [sp, #20]
+	ldr	r1, [sp, #24]
+	ldr	r1, [sp, #28]
+
+	ldr	r1, [sp]
+	ldr	r1, [sp, #4]
+	ldr	r1, [sp, #8]
+	ldr	r1, [sp, #12]
+	ldr	r1, [sp, #16]
+	ldr	r1, [sp, #20]
+	ldr	r1, [sp, #24]
+	ldr	r1, [sp, #28]
+
+	ldr	r1, [sp]
+	ldr	r1, [sp, #4]
+	ldr	r1, [sp, #8]
+	ldr	r1, [sp, #12]
+	ldr	r1, [sp, #16]
+	ldr	r1, [sp, #20]
+	ldr	r1, [sp, #24]
+	ldr	r1, [sp, #28]
+
+	ldr	r1, [sp]
+	ldr	r1, [sp, #4]
+	ldr	r1, [sp, #8]
+	ldr	r1, [sp, #12]
+	ldr	r1, [sp, #16]
+	ldr	r1, [sp, #20]
+	ldr	r1, [sp, #24]
+	ldr	r1, [sp, #28]
+
+	ldr	r1, [sp]
+	ldr	r1, [sp, #4]
+	ldr	r1, [sp, #8]
+	ldr	r1, [sp, #12]
+	ldr	r1, [sp, #16]
+	ldr	r1, [sp, #20]
+	ldr	r1, [sp, #24]
+	ldr	r1, [sp, #28]
+
+	ldr	r1, [sp]
+	ldr	r1, [sp, #4]
+	ldr	r1, [sp, #8]
+	ldr	r1, [sp, #12]
+	ldr	r1, [sp, #16]
+	ldr	r1, [sp, #20]
+	ldr	r1, [sp, #24]
+	ldr	r1, [sp, #28]
+
+# Remaining passes.
+	sub	r0, #1
+	cmp	r0, #0
+	bne	.L9
+
+# Release the scratch area so that sp points at the saved lr again.
+	add	sp, #32
+
+# return.
+	ldmfd	sp!,{pc}
+
+#-----------------------------------------------------------------------------
+# Name: 	StackWriter
+# Purpose:	Performs register-to-stack transfers.
+# Params:
+#	r0 = count
+# Notes:
+#	Reserves a 32-byte scratch area on the stack and overwrites its eight
+#	words eight times per pass. The value stored is r1, which this routine
+#	never sets, so it is whatever the caller left in that register.
+#	The count is decremented before it is tested, so a zero count is not
+#	treated as "do nothing".
+#-----------------------------------------------------------------------------
+StackWriter:
+	stmfd	sp!,{lr}
+
+# r1 = value stored; not initialised here
+
+# Reserve the 32-byte scratch area (released again before returning).
+	sub	sp, #32
+.L10:
+# Does 64 transfers, 4 bytes each = 256 bytes total.
+
+	str	r1, [sp]
+	str	r1, [sp, #4]
+	str	r1, [sp, #8]
+	str	r1, [sp, #12]
+	str	r1, [sp, #16]
+	str	r1, [sp, #20]
+	str	r1, [sp, #24]
+	str	r1, [sp, #28]
+
+	str	r1, [sp]
+	str	r1, [sp, #4]
+	str	r1, [sp, #8]
+	str	r1, [sp, #12]
+	str	r1, [sp, #16]
+	str	r1, [sp, #20]
+	str	r1, [sp, #24]
+	str	r1, [sp, #28]
+
+	str	r1, [sp]
+	str	r1, [sp, #4]
+	str	r1, [sp, #8]
+	str	r1, [sp, #12]
+	str	r1, [sp, #16]
+	str	r1, [sp, #20]
+	str	r1, [sp, #24]
+	str	r1, [sp, #28]
+
+	str	r1, [sp]
+	str	r1, [sp, #4]
+	str	r1, [sp, #8]
+	str	r1, [sp, #12]
+	str	r1, [sp, #16]
+	str	r1, [sp, #20]
+	str	r1, [sp, #24]
+	str	r1, [sp, #28]
+
+	str	r1, [sp]
+	str	r1, [sp, #4]
+	str	r1, [sp, #8]
+	str	r1, [sp, #12]
+	str	r1, [sp, #16]
+	str	r1, [sp, #20]
+	str	r1, [sp, #24]
+	str	r1, [sp, #28]
+
+	str	r1, [sp]
+	str	r1, [sp, #4]
+	str	r1, [sp, #8]
+	str	r1, [sp, #12]
+	str	r1, [sp, #16]
+	str	r1, [sp, #20]
+	str	r1, [sp, #24]
+	str	r1, [sp, #28]
+
+	str	r1, [sp]
+	str	r1, [sp, #4]
+	str	r1, [sp, #8]
+	str	r1, [sp, #12]
+	str	r1, [sp, #16]
+	str	r1, [sp, #20]
+	str	r1, [sp, #24]
+	str	r1, [sp, #28]
+
+	str	r1, [sp]
+	str	r1, [sp, #4]
+	str	r1, [sp, #8]
+	str	r1, [sp, #12]
+	str	r1, [sp, #16]
+	str	r1, [sp, #20]
+	str	r1, [sp, #24]
+	str	r1, [sp, #28]
+
+# Remaining passes.
+	sub	r0, #1
+	cmp	r0, #0
+	bne	.L10
+
+# Release the scratch area so that sp points at the saved lr again.
+	add	sp, #32
+
+# return.
+	ldmfd	sp!,{pc}
+
diff --git a/routinesARM64.S b/routinesARM64.S
new file mode 100644
index 0000000..fa1f8b7
--- /dev/null
+++ b/routinesARM64.S
@@ -0,0 +1,483 @@
+
+# ============================================================================
+#  bandwidth 0.23, a benchmark to estimate memory transfer bandwidth.
+#  ARM64 (AArch64) assembly module.
+#  Copyright (C) 2010 by Zack T Smith.
+#
+#  This program is free software; you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation; either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with this program; if not, write to the Free Software
+#  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#
+#  The author may be reached at fbui@comcast.net.
+# =============================================================================
+
+#--------------
+# Version 0.7
+#--------------
+
+#include "config.h"
+# Assemble for the ARMv8-A architecture.
+.arch armv8-a
+# NOTE(review): __thumb2__ is tested below although every routine here uses 64-bit x registers; the Thumb directives look like leftovers - confirm.
+#ifdef __thumb2__
+.syntax unified
+.code 16
+#endif
+
+.section code
+# NOTE(review): .text follows immediately, so the section named code stays empty; confirm it is needed.
+.text
+.align 2
+# Entry points made visible to the linker.
+.global Writer
+.global RandomWriter
+
+.global Reader
+.global RandomReader
+
+.global	RegisterToRegister
+.global	StackReader
+.global	StackWriter
+
+#-----------------------------------------------------------------------------
+# Name: 	Writer
+# Purpose:	Performs sequential write into memory, as fast as possible, 256 bytes per inner pass.
+# Params:
+#	x0 = address
+#	x1 = length in bytes; rounded down to a multiple of 256 (returns at once when that leaves 0)
+#	x2 = loop count, i.e. number of passes over the buffer (returns at once when 0)
+# 	x3 = value to write
+#-----------------------------------------------------------------------------
+Writer:
+	stp	x29, x30, [sp, #-16]!
+# Round length down to a multiple of 256 so the inner countdown lands exactly on zero; leave early when no work remains.
+	bic	x1, x1, #0xff
+	cbz	x1, .Lwriter_done
+	cbz	x2, .Lwriter_done
+	mov	x4, x0
+	mov	x5, x1
+	mov	x6, x3
+# x4 = saved start address, x5 = saved (rounded) length, reloaded at the top of every outer pass
+# x6 = copy of x3, so that each stp stores the value twice (16 bytes)
+
+.L0:
+	mov	x0, x4
+	mov	x1, x5
+
+.L1:
+	stp x3, x6, [x0]
+	stp x3, x6, [x0, #16]
+	stp x3, x6, [x0, #32]
+	stp x3, x6, [x0, #48]
+	stp x3, x6, [x0, #64]
+	stp x3, x6, [x0, #80]
+	stp x3, x6, [x0, #96]
+	stp x3, x6, [x0, #112]
+	stp x3, x6, [x0, #128]
+	stp x3, x6, [x0, #144]
+	stp x3, x6, [x0, #160]
+	stp x3, x6, [x0, #176]
+	stp x3, x6, [x0, #192]
+	stp x3, x6, [x0, #208]
+	stp x3, x6, [x0, #224]
+	stp x3, x6, [x0, #240]
+	add x0, x0, #256
+
+	sub	x1, x1, #256
+	cbnz	x1, .L1
+
+	sub	x2, x2, #1
+	cbnz	x2, .L0
+# Common exit: restore the frame record and return.
+.Lwriter_done:
+	ldp x29, x30, [sp], #16
+	ret
+
+#-----------------------------------------------------------------------------
+# Name: 	Reader
+# Purpose:	Performs sequential reads from memory, as fast as possible, 256 bytes per inner pass.
+# Params:
+#	x0 = address
+#	x1 = length in bytes; rounded down to a multiple of 256 (returns at once when that leaves 0)
+#	x2 = loop count, i.e. number of passes over the buffer (returns at once when 0)
+#-----------------------------------------------------------------------------
+Reader:
+	stp	x29, x30, [sp, #-16]!
+	stp	x20, x21, [sp, #-16]!
+	stp	x18, x19, [sp, #-16]!
+	stp	x16, x17, [sp, #-16]!
+	stp	x14, x15, [sp, #-16]!
+	stp	x12, x13, [sp, #-16]!
+	stp	x10, x11, [sp, #-16]!
+	stp	x8, x9, [sp, #-16]!
+# Round length down to a multiple of 256 so the inner countdown lands exactly on zero; skip the loops when no work remains.
+	bic	x1, x1, #0xff
+	cbz	x1, .Lreader_done
+	cbz	x2, .Lreader_done
+	mov	x4, x0
+	mov	x5, x1
+# x4 = saved start address, x5 = saved (rounded) length; the loaded values themselves are discarded.
+# NOTE(review): x18 is a load target below; AAPCS64 reserves it as the platform register on some OSes - confirm for each target.
+.L2:
+	mov	x0, x4
+	mov	x1, x5
+
+.L3:
+	ldp x3, x6, [x0]
+	ldp x7, x8, [x0, #16]
+	ldp x9, x10, [x0, #32]
+	ldp x11, x12, [x0, #48]
+	ldp x13, x14, [x0, #64]
+	ldp x15, x16, [x0, #80]
+	ldp x17, x18, [x0, #96]
+	ldp x19, x20, [x0, #112]
+	ldp x21, x6, [x0, #128]
+	ldp x7, x8, [x0, #144]
+	ldp x9, x10, [x0, #160]
+	ldp x11, x12, [x0, #176]
+	ldp x13, x14, [x0, #192]
+	ldp x15, x16, [x0, #208]
+	ldp x17, x18, [x0, #224]
+	ldp x19, x20, [x0, #240]
+	add x0, x0, #256
+
+	sub	x1, x1, #256
+	cbnz	x1, .L3
+
+	sub	x2, x2, #1
+	cbnz	x2, .L2
+# Common exit: restore the saved registers in reverse order of the pushes, then return.
+.Lreader_done:
+	ldp x8, x9, [sp], #16
+	ldp x10, x11, [sp], #16
+	ldp x12, x13, [sp], #16
+	ldp x14, x15, [sp], #16
+	ldp x16, x17, [sp], #16
+	ldp x18, x19, [sp], #16
+	ldp x20, x21, [sp], #16
+	ldp x29, x30, [sp], #16
+	ret
+
+#-----------------------------------------------------------------------------
+# Name: 	RandomWriter
+# Purpose:	Writes x3 into every 8-byte slot of each listed 256-byte chunk, visiting the slots in a fixed scrambled order.
+# Params:
+# 	x0 = pointer to array of chunk pointers
+# 	x1 = # of 256-byte chunks (entries in the array); must be nonzero, the index is compared only after incrementing
+# 	x2 = # loops to do (passes over the whole array); must be nonzero, it is decremented before being tested
+# 	x3 = value to write
+#-----------------------------------------------------------------------------
+RandomWriter:
+	stp	x29, x30, [sp, #-16]!
+
+# x4 = address of the current chunk
+# x5 = index of the current chunk within the pointer array
+
+.L4:
+	mov	x5, #0
+
+.L5:
+# Fetch chunk pointer number x5. Note, 64-bit pointers, hence the index is scaled by 8 (LSL #3).
+	ldr	x4, [x0, x5, LSL #3]
+
+# Does 32 transfers, 8 bytes each = 256 bytes total; each offset 0..248 is hit exactly once.
+
+	str	x3, [x4, #160]
+	str	x3, [x4, #232]
+	str	x3, [x4, #224]
+	str	x3, [x4, #96]
+	str	x3, [x4, #168]
+	str	x3, [x4, #80]
+	str	x3, [x4, #104]
+	str	x3, [x4, #248]
+	str	x3, [x4, #8]
+	str	x3, [x4, #136]
+	str	x3, [x4, #112]
+	str	x3, [x4, #200]
+	str	x3, [x4, #128]
+	str	x3, [x4, #152]
+	str	x3, [x4, #216]
+	str	x3, [x4]
+	str	x3, [x4, #88]
+	str	x3, [x4, #144]
+	str	x3, [x4, #208]
+	str	x3, [x4, #184]
+	str	x3, [x4, #48]
+	str	x3, [x4, #64]
+	str	x3, [x4, #240]
+	str	x3, [x4, #24]
+	str	x3, [x4, #72]
+	str	x3, [x4, #32]
+	str	x3, [x4, #56]
+	str	x3, [x4, #16]
+	str	x3, [x4, #40]
+	str	x3, [x4, #176]
+	str	x3, [x4, #120]
+	str	x3, [x4, #192]
+# Advance to the next chunk; the inner loop ends once x5 equals x1.
+	add	x5, x5, #1
+	cmp	x5, x1
+	bne	.L5
+# One full pass over the array is done; count it and repeat until x2 reaches zero.
+	sub	x2, x2, #1
+	cbnz	x2, .L4
+
+# Restore the frame record and return.
+	ldp x29, x30, [sp], #16
+	ret
+
+#-----------------------------------------------------------------------------
+# Name: 	RandomReader
+# Purpose:	Reads every 8-byte slot of each listed 256-byte chunk, visiting the slots in a fixed scrambled order.
+# Params:
+# 	x0 = pointer to array of chunk pointers
+# 	x1 = # of 256-byte chunks (entries in the array); must be nonzero, the index is compared only after incrementing
+# 	x2 = # loops to do (passes over the whole array); must be nonzero, it is decremented before being tested
+#-----------------------------------------------------------------------------
+RandomReader:
+	stp	x29, x30, [sp, #-16]!
+
+# x4 = address of the current chunk
+# x5 = index of the current chunk within the pointer array
+
+.L6:
+	mov	x5, #0
+
+.L7:
+# Fetch chunk pointer number x5. Note, 64-bit pointers, hence the index is scaled by 8 (LSL #3).
+	ldr	x4, [x0, x5, LSL #3]
+
+# Does 32 transfers, 8 bytes each = 256 bytes total; x3 is overwritten by every load, so the data is discarded.
+
+	ldr	x3, [x4, #160]
+	ldr	x3, [x4, #232]
+	ldr	x3, [x4, #224]
+	ldr	x3, [x4, #96]
+	ldr	x3, [x4, #168]
+	ldr	x3, [x4, #80]
+	ldr	x3, [x4, #104]
+	ldr	x3, [x4, #248]
+	ldr	x3, [x4, #8]
+	ldr	x3, [x4, #136]
+	ldr	x3, [x4, #112]
+	ldr	x3, [x4, #200]
+	ldr	x3, [x4, #128]
+	ldr	x3, [x4, #152]
+	ldr	x3, [x4, #216]
+	ldr	x3, [x4]
+	ldr	x3, [x4, #88]
+	ldr	x3, [x4, #144]
+	ldr	x3, [x4, #208]
+	ldr	x3, [x4, #184]
+	ldr	x3, [x4, #48]
+	ldr	x3, [x4, #64]
+	ldr	x3, [x4, #240]
+	ldr	x3, [x4, #24]
+	ldr	x3, [x4, #72]
+	ldr	x3, [x4, #32]
+	ldr	x3, [x4, #56]
+	ldr	x3, [x4, #16]
+	ldr	x3, [x4, #40]
+	ldr	x3, [x4, #176]
+	ldr	x3, [x4, #120]
+	ldr	x3, [x4, #192]
+# Advance to the next chunk; the inner loop ends once x5 equals x1.
+	add	x5, x5, #1
+	cmp	x5, x1
+	bne	.L7
+# One full pass over the array is done; count it and repeat until x2 reaches zero.
+	sub	x2, x2, #1
+	cbnz	x2, .L6
+
+# Restore the frame record and return.
+	ldp x29, x30, [sp], #16
+	ret
+
+#-----------------------------------------------------------------------------
+# Name: 	RegisterToRegister
+# Purpose:	Performs register-to-register transfers; the loop body touches no memory.
+# Params:
+#	x0 = count of loop passes (32 moves per pass); must be nonzero, since it is decremented before being tested
+#-----------------------------------------------------------------------------
+RegisterToRegister:
+	stp	x29, x30, [sp, #-16]!
+
+# x1 and x2 = destinations; x3-x9 are only read and are never set in this routine, so the values moved are arbitrary.
+
+.L8:
+# Does 32 transfers, 8 bytes each = 256 bytes total.
+
+	mov	x1, x2
+	mov	x1, x3
+	mov	x1, x4
+	mov	x1, x5
+	mov	x1, x6
+	mov	x1, x7
+	mov	x1, x8
+	mov	x1, x9
+
+	mov	x2, x1
+	mov	x2, x3
+	mov	x2, x4
+	mov	x2, x5
+	mov	x2, x6
+	mov	x2, x7
+	mov	x2, x8
+	mov	x2, x9
+
+	mov	x1, x2
+	mov	x1, x3
+	mov	x1, x4
+	mov	x1, x5
+	mov	x1, x6
+	mov	x1, x7
+	mov	x1, x8
+	mov	x1, x9
+
+	mov	x1, x2
+	mov	x1, x3
+	mov	x1, x4
+	mov	x1, x5
+	mov	x1, x6
+	mov	x1, x7
+	mov	x1, x8
+	mov	x1, x9
+
+# Count down and repeat until x0 reaches zero.
+	sub	x0, x0, #1
+	cbnz	x0, .L8
+
+# Restore the frame record and return.
+	ldp x29, x30, [sp], #16
+	ret
+
+#-----------------------------------------------------------------------------
+# Name: 	StackReader
+# Purpose:	Performs stack-to-register transfers: loads over and over from a 64-byte stack scratch area.
+# Params:
+#	x0 = count of loop passes (256 bytes read per pass); must be nonzero, since it is decremented before being tested
+#-----------------------------------------------------------------------------
+StackReader:
+	stp	x29, x30, [sp, #-16]!
+
+# x1 = load destination; every loaded value is overwritten by the next load and never used.
+# Reserve a 64-byte scratch area (eight 8-byte slots); nothing is stored there first, so the loads read arbitrary data.
+	sub	sp, sp, #64
+.L9:
+# Does 32 transfers, 8 bytes each = 256 bytes total, cycling over the same eight slots.
+
+	ldr	x1, [sp]
+	ldr	x1, [sp, #8]
+	ldr	x1, [sp, #16]
+	ldr	x1, [sp, #24]
+	ldr	x1, [sp, #32]
+	ldr	x1, [sp, #40]
+	ldr	x1, [sp, #48]
+	ldr	x1, [sp, #56]
+
+	ldr	x1, [sp]
+	ldr	x1, [sp, #8]
+	ldr	x1, [sp, #16]
+	ldr	x1, [sp, #24]
+	ldr	x1, [sp, #32]
+	ldr	x1, [sp, #40]
+	ldr	x1, [sp, #48]
+	ldr	x1, [sp, #56]
+
+	ldr	x1, [sp]
+	ldr	x1, [sp, #8]
+	ldr	x1, [sp, #16]
+	ldr	x1, [sp, #24]
+	ldr	x1, [sp, #32]
+	ldr	x1, [sp, #40]
+	ldr	x1, [sp, #48]
+	ldr	x1, [sp, #56]
+
+	ldr	x1, [sp]
+	ldr	x1, [sp, #8]
+	ldr	x1, [sp, #16]
+	ldr	x1, [sp, #24]
+	ldr	x1, [sp, #32]
+	ldr	x1, [sp, #40]
+	ldr	x1, [sp, #48]
+	ldr	x1, [sp, #56]
+# Count down and repeat until x0 reaches zero.
+	sub	x0, x0, #1
+	cbnz	x0, .L9
+# Release the scratch area.
+	add	sp, sp, #64
+
+# Restore the frame record and return.
+	ldp x29, x30, [sp], #16
+	ret
+
+#-----------------------------------------------------------------------------
+# Name: 	StackWriter
+# Purpose:	Performs register-to-stack transfers: stores x1 over and over into a 64-byte stack scratch area.
+# Params:
+#	x0 = count of loop passes (256 bytes stored per pass); must be nonzero, since it is decremented before being tested
+#-----------------------------------------------------------------------------
+StackWriter:
+	stp	x29, x30, [sp, #-16]!
+
+# x1 = value stored; it is never set in this routine, so its contents are whatever the caller left there.
+# Reserve a 64-byte scratch area (eight 8-byte slots) below the saved frame record.
+	sub	sp, sp, #64
+.L10:
+# Does 32 transfers, 8 bytes each = 256 bytes total, cycling over the same eight slots.
+
+	str	x1, [sp]
+	str	x1, [sp, #8]
+	str	x1, [sp, #16]
+	str	x1, [sp, #24]
+	str	x1, [sp, #32]
+	str	x1, [sp, #40]
+	str	x1, [sp, #48]
+	str	x1, [sp, #56]
+
+	str	x1, [sp]
+	str	x1, [sp, #8]
+	str	x1, [sp, #16]
+	str	x1, [sp, #24]
+	str	x1, [sp, #32]
+	str	x1, [sp, #40]
+	str	x1, [sp, #48]
+	str	x1, [sp, #56]
+
+	str	x1, [sp]
+	str	x1, [sp, #8]
+	str	x1, [sp, #16]
+	str	x1, [sp, #24]
+	str	x1, [sp, #32]
+	str	x1, [sp, #40]
+	str	x1, [sp, #48]
+	str	x1, [sp, #56]
+
+	str	x1, [sp]
+	str	x1, [sp, #8]
+	str	x1, [sp, #16]
+	str	x1, [sp, #24]
+	str	x1, [sp, #32]
+	str	x1, [sp, #40]
+	str	x1, [sp, #48]
+	str	x1, [sp, #56]
+# Count down and repeat until x0 reaches zero.
+	sub	x0, x0, #1
+	cbnz	x0, .L10
+# Release the scratch area.
+	add	sp, sp, #64
+
+# Restore the frame record and return.
+	ldp x29, x30, [sp], #16
+	ret
+