From 4a83f67490136a898f558e273b76a687aed8b893 Mon Sep 17 00:00:00 2001
From: "Field G. Van Zee" <field@cs.utexas.edu>
Date: Mon, 17 Dec 2012 12:35:54 -0600
Subject: [PATCH] Consolidated configuration headers.

Details:
- Merged contents of bl2_arch.h into bl2_config.h for reference and
  clarksville configurations.
- Updated CREDITS, INSTALL, LICENSE, README files.
---
 CREDITS                         |   6 +-
 INSTALL                         |  76 +++++++-------
 LICENSE                         |   6 +-
 README                          |   2 +-
 config/clarksville/bl2_arch.h   | 173 -------------------------------
 config/clarksville/bl2_config.h | 174 +++++++++++++++++++++++++++++++-
 config/reference/bl2_arch.h     | 173 -------------------------------
 config/reference/bl2_config.h   | 174 +++++++++++++++++++++++++++++++-
 frame/include/blis2.h           |  10 +-
 version                         |   2 +-
 10 files changed, 389 insertions(+), 407 deletions(-)
 delete mode 100644 config/clarksville/bl2_arch.h
 delete mode 100644 config/reference/bl2_arch.h

diff --git a/CREDITS b/CREDITS
index 6138762bc7..483695820f 100644
--- a/CREDITS
+++ b/CREDITS
@@ -1,12 +1,13 @@
-Acknowledgements
 
+BLIS framework
+Acknowledgements
 ---
 
 The BLIS framework was primarily authored by
 
   Field Van Zee               (The University of Texas at Austin)
 
-but many others gave input and feedback as it was initially
+but many others contributed input and feedback as it was initially
 conceived, designed, and developed, including:
 
   John Gunnels                (IBM, T.J. Watson Research Center)
@@ -27,4 +28,3 @@ of BLIS to new architectures as proofs-of-concept:
 BLIS's development was partially funded by grants from Microsoft and
 the National Science Foundation (Awards CCF-0917167 and OCI-1148125).
 
-  
diff --git a/INSTALL b/INSTALL
index 1d04ea6b8b..c7d7466fc6 100644
--- a/INSTALL
+++ b/INSTALL
@@ -37,33 +37,28 @@ When creating your configuration sub-directory, you can use the reference
 configuration as a template:
 
   > ls config/reference
-  bl2_arch.h  bl2_config.h  bl2_kernel.h  make_defs.mk
+  bl2_config.h  bl2_kernel.h  make_defs.mk
   > cp -r config/reference config/x86-64opt
   > ls config/x86-64opt
-  bl2_arch.h  bl2_config.h  bl2_kernel.h  make_defs.mk
+  bl2_config.h  bl2_kernel.h  make_defs.mk
 
 Then you would edit each of these four files. Here are some special notes
 about each file:
 
-  o bl2_arch.h
-    - This is where you specify architectural details such as register and
-      cache blocksizes used by the micro-kernel(s).
-
   o bl2_config.h
-    - This file is where you could place C preprocessor macros that would
-      typically be defined by the arguments to an autoconf-style configure
-      script. It contains just a few basic definitions such as whether to
-      use memory alignment and, if so, what boundary multiple to use.
+    - This is where you specify most of the parameters of your BLIS
+      configuration, including (but not limited to) such things as register
+      and cache blocksizes used by the micro-kernel(s).
 
   o bl2_kernel.h
     - This file defines C preprocessor macros associated with various kernels
       and micro-kernels. The reference configuration defines all kernels to
       use reference implementations (which are provided as part of the BLIS
       framework). If you end up writing your own optimized kernel for some
-      operation, be sure and set it here. Notice that you only have to set
-      ONE definition for each operation, as BLIS prepends s,d,c,z (as well as
-      the BLIS function prefix, currently 'bl2_') to the names to create the
-      actual datatype instances.
+      operation, be sure to enable its use here. Notice that you only have to
+      set ONE definition for each operation, as BLIS prepends s,d,c,z (as well
+      as the BLIS function prefix, currently 'bl2_') to the names to create
+      the actual datatype instances.
     - IMPORTANT: If you add your own kernels, OR if you use kernels provided
       with the BLIS framework distribution (i.e., in the 'kernels' directory),
       you MUST add a symbolic link to those kernels to the configuration
@@ -74,11 +69,12 @@ about each file:
         > pwd
         /home/field/google_code/blis/config/x86-64opt
         > ls
-        bl2_arch.h  bl2_config.h  bl2_kernel.h  make_defs.mk
+        bl2_config.h  bl2_kernel.h  make_defs.mk
         > ls ../../kernels
         x86  x86_64
         > ln -s ../../kernels/x86_64 kernels
-        > ls bl2_arch.h  bl2_config.h  bl2_kernel.h  kernels  make_defs.mk
+        > ls
+        bl2_config.h  bl2_kernel.h  kernels  make_defs.mk
         > ls -l kernels
         lrwxrwxrwx 1 field dept 20 Dec  1 18:13 kernels -> ../../kernels/x86_64
 
@@ -303,52 +299,51 @@ ever unsure which configuration is "active", simply run:
 
 This will tell you which configuration is specified by the config.mk file.
 
-Another benefit to switching configuration is that a previous configuration's
+Another benefit to configuration switching is that a previous configuration's
 object files are saved from the previous build. For example, suppose you
 configure a configuration named 'debug'. You run 'make' followed by 'make
 install'. You use that debug-enabled library for a while and then you create
 a configuration named 'opt', which is the same configuration in optimized
 form. You compile, install, and test that library. But then you want to return
 to developing with the 'debug' configuration. Simply run './configure debug'
-again to switch to the 'debug' configuration. If haven't run any of the
-'clean' targets in the interim, then the previously created object files for
-the 'debug' configuration will still be there (inside './obj/debug'). If you
+again to switch to the 'debug' configuration. If you haven't run any of the
+'clean' targets in the interim, the previously created object files for the
+'debug' configuration will still be there (inside './obj/debug'). If you
 change a .c source file (not a header file; see "Caveats" section below), you
 won't have to recompile the whole library. Rather, only the source files that
 changed since the previous configuration and compilation will need to be
-recompiled, because the object files created by the initial 'debug' build (and
-their modification times) are preserved even after you switch away to another
-configuration.
+recompiled.
 
 Now, while the object files are "remembered" from previous configurations,
 the library archives (and headers) are not. So, 'make install' will always
 re-install the build products after returning to a previous configuration,
 even if no source files (or header files) changed. We must re-install the
 libraries unconditionally after switching back to a previous configuration
-because while the configuration name might be the same, the installation
-prefix might have changed. Thus, we must install the build products just
-in case. But the installation itself is a relatively low overhead task,
-and so shouldn't be much of a burden to a developer who switches his or her
-configurations.
+because while the configuration name (e.g., 'debug' or 'opt') might be the
+same, the installation prefix might have changed. Thus, we must install the
+build products just in case. But the installation itself is a relatively
+low overhead task, and so shouldn't be much of a burden to a developer who
+switches his or her configurations frequently.
 
 -- Caveats --
 
-Due to the way the BLIS framework handles header files, any change to any
-header file will result in the entire library being rebuilt. This policty is
-mostly out of an abundance of caution. If two or more files use definitions in
-a header that is modified, and one or more of those files somehow does not get
-recompiled to reflect the updated definitions, you could end up sinking hours
-of time trying to track down a bug that didn't ever need to be an issue to
-begin with. Thus, to prevent developers from shooting themselves in the foot
-with this problem, the BLIS build system recompiles *all* object files if
-a header file--any header file--is touched.
+Due to the way the BLIS framework handles header files, *any* change to *any*
+header file will result in the entire library being rebuilt. This policy is
+in place mostly out of an abundance of caution. If two or more files use
+definitions in a header that is modified, and one or more of those files
+somehow does not get recompiled to reflect the updated definitions, you could
+end up sinking hours of time trying to track down a bug that didn't ever need
+to be an issue to begin with. Thus, to prevent developers (including the
+framework developer(s)) from shooting themselves in the foot with this
+problem, the BLIS build system recompiles *all* object files if a header
+file--any header file--is touched.
 
 
 CONCLUSION
 
-That's it! The BLIS framework's build system adheres to the familiar
-"./configure; make ; make install" build process that many of us are
-used to.
+That's it! While the BLIS framework's build system has some nice features, it
+essentially adheres to the familiar "./configure; make ; make install" build
+process that many of us are used to.
 
 If you have feedback, please consider keeping in touch with the project
 maintainers, contributors, and other users by joining and participating
@@ -358,4 +353,3 @@ Thanks for using BLIS!
 
 Field Van Zee
 
-
diff --git a/LICENSE b/LICENSE
index fa748f0aa8..e06e9283ba 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,7 +1,11 @@
+
+BLIS framework
+License
+---
+
 The BLIS framework is licensed under the following license, typically
 known as the "new" or "modified" or "3-clause" BSD license.
 
----
 
 Copyright (C) 2012, The University of Texas
 
diff --git a/README b/README
index 125e830cbe..7ce5513dc5 100644
--- a/README
+++ b/README
@@ -8,7 +8,7 @@ Thank you for deciding to try out the BLIS framework!
 BLIS is a portable framework for instantiating BLAS-like libraries. The
 framework was designed to isolate essential kernels of computation that,
 when optimized, immediately enable optimized implementations of most of
-the commonly used and computationally intensive operations.
+its commonly used and computationally intensive operations.
 
 BLIS has many features. For more detailed information about the project,
 please check the BLIS homepage:
diff --git a/config/clarksville/bl2_arch.h b/config/clarksville/bl2_arch.h
deleted file mode 100644
index e390c2df37..0000000000
--- a/config/clarksville/bl2_arch.h
+++ /dev/null
@@ -1,173 +0,0 @@
-/*
-
-   BLIS    
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2012, The University of Texas
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name of The University of Texas nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_ARCH_H
-#define BLIS_ARCH_H
-
-// -- General floating-point constants --
-
-#define BLIS_NUM_FP_TYPES              4
-#define BLIS_MAX_TYPE_SIZE             sizeof(dcomplex)
-
-
-// -- Maximum offset of an element that might be pre-loaded/prefetched --
-
-#define BLIS_MAX_PREFETCH_BYTE_OFFSET  128
-
-
-// -- Page size --
-
-#define BLIS_PAGE_SIZE                 4096
-
-
-// -- Number of elements per vector register --
-
-#define BLIS_NUM_ELEM_PER_REG_S        4
-#define BLIS_NUM_ELEM_PER_REG_D        2
-#define BLIS_NUM_ELEM_PER_REG_C        2
-#define BLIS_NUM_ELEM_PER_REG_Z        1
-
-
-// -- Default fusing factors for level-1 fused operations --
-
-#define BLIS_DEFAULT_FUSING_FACTOR_S   8
-#define BLIS_DEFAULT_FUSING_FACTOR_D   4
-#define BLIS_DEFAULT_FUSING_FACTOR_C   4
-#define BLIS_DEFAULT_FUSING_FACTOR_Z   2
-
-
-// -- Default cache blocksizes --
-
-// Constraints:
-//
-// (1) MC must be a multiple of:
-//     (a) MR (for zero-padding purposes) and
-//     (b) NR.
-// (2) NC must be a multiple of
-//     (a) NR (for zero-padding purposes) and
-//     (b) MR.
-// (3) KC does not need to be multiple of anything, unless the micro-kernel
-//     specifically requires it (and typically it does not).
-// 
-// NOTE: For BLIS libraries built on block-panel macro-kernels, constraint
-// (2b) is relaxed. In this case, (1b) is needed for operation implementations
-// involving matrices with diagonals (trmm, trsm). In these cases, we want the
-// diagonal offset of any panel of packed matrix A to have a diagonal offset
-// that is a multiple of MR. If, instead, the library were to be built on
-// block-panel macro-kernels, matrix B would be the one with structure, not A,
-// and thus it would be constraint (2b) that would be needed instead of (1b).
-//
-
-#define BLIS_DEFAULT_MC_S              128
-#define BLIS_DEFAULT_KC_S              256
-#define BLIS_DEFAULT_NC_S              8192
-
-#define BLIS_DEFAULT_MC_D              368
-#define BLIS_DEFAULT_KC_D              256
-#define BLIS_DEFAULT_NC_D              8192
-
-#define BLIS_DEFAULT_MC_C              128
-#define BLIS_DEFAULT_KC_C              256
-#define BLIS_DEFAULT_NC_C              8192
-
-#define BLIS_DEFAULT_MC_Z              128
-#define BLIS_DEFAULT_KC_Z              256
-#define BLIS_DEFAULT_NC_Z              8192
-
-
-// -- Default register blocksizes for inner kernel --
-
-#define BLIS_DEFAULT_MR_S              8
-#define BLIS_DEFAULT_NR_S              2
-
-#define BLIS_DEFAULT_MR_D              4
-#define BLIS_DEFAULT_NR_D              4
-
-#define BLIS_DEFAULT_MR_C              4
-#define BLIS_DEFAULT_NR_C              1
-
-#define BLIS_DEFAULT_MR_Z              2
-#define BLIS_DEFAULT_NR_Z              1
-
-// NOTE: If the micro-kernel, which is typically unrolled to a factor
-// of f, handles leftover edge cases (ie: when k % f > 0) then these
-// register blocksizes in the k dimension can be defined to 1.
-
-#define BLIS_DEFAULT_KR_S              1
-#define BLIS_DEFAULT_KR_D              1
-#define BLIS_DEFAULT_KR_C              1
-#define BLIS_DEFAULT_KR_Z              1
-
-
-// -- Default switch for duplication of B --
-
-// NOTE: If BLIS_DEFAULT_DUPLICATE_B is set to FALSE, then the
-// NUM_DUPL definitions are not used.
-
-//#define BLIS_DEFAULT_DUPLICATE_B       TRUE
-#define BLIS_DEFAULT_DUPLICATE_B       FALSE
-#define BLIS_DEFAULT_NUM_DUPL_S        BLIS_NUM_ELEM_PER_REG_S
-#define BLIS_DEFAULT_NUM_DUPL_D        BLIS_NUM_ELEM_PER_REG_D
-#define BLIS_DEFAULT_NUM_DUPL_C        BLIS_NUM_ELEM_PER_REG_C
-#define BLIS_DEFAULT_NUM_DUPL_Z        BLIS_NUM_ELEM_PER_REG_Z
-
-
-// -- Default incremental packing blocksizes (n dimension) --
-
-// NOTE: These incremental packing blocksizes (for the n dimension) are only
-// used by certain blocked variants. But when the *are* used, they MUST be
-// be an integer multiple of NR!
-
-#define BLIS_DEFAULT_NI_FAC            16
-#define BLIS_DEFAULT_NI_S              (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_S)
-#define BLIS_DEFAULT_NI_D              (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_D)
-#define BLIS_DEFAULT_NI_C              (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_C)
-#define BLIS_DEFAULT_NI_Z              (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_Z)
-
-
-// -- Default register blocksizes for vectors --
-
-// NOTE: Register blocksizes for vectors are used when packing
-// non-contiguous vectors. Similar to that of KR, they can
-// typically be set to 1.
-
-#define BLIS_DEFAULT_VR_S              1
-#define BLIS_DEFAULT_VR_D              1
-#define BLIS_DEFAULT_VR_C              1
-#define BLIS_DEFAULT_VR_Z              1
-
-
-
-
-#endif
diff --git a/config/clarksville/bl2_config.h b/config/clarksville/bl2_config.h
index 417d4dce5e..eefaab61af 100644
--- a/config/clarksville/bl2_config.h
+++ b/config/clarksville/bl2_config.h
@@ -35,21 +35,50 @@
 #ifndef BLIS_CONFIG_H
 #define BLIS_CONFIG_H
 
+
+// -- OPERATING SYSTEM ---------------------------------------------------------
+
 // Declaration for posix_memalign() needs this.
 #define _GNU_SOURCE 1
 
+
+
+// -- FLOATING-POINT PROPERTIES ------------------------------------------------
+
+#define BLIS_NUM_FP_TYPES                4
+#define BLIS_MAX_TYPE_SIZE               sizeof(dcomplex)
+
+
+
+// -- MEMORY ALLOCATOR ---------------------------------------------------------
+
+// Static memory pool size.
+#define BLIS_STATIC_MEM_POOL_SIZE        (256 * 5000 * sizeof(double))
+
 // Enable memory alignment?
-#define BLIS_ENABLE_MEMORY_ALIGNMENT 1
+#define BLIS_ENABLE_MEMORY_ALIGNMENT     1
 
-// Memory alignment boundary.
+// If memory alignment is enabled, set the alignment boundary.
 #ifndef BLIS_MEMORY_ALIGNMENT_BOUNDARY
   #define BLIS_MEMORY_ALIGNMENT_BOUNDARY 16
 #endif
 
-// Static memory pool size.
-#define BLIS_STATIC_MEM_POOL_SIZE (256 * 5000 * sizeof(double))
+// The page size is used by the memory allocator so that static memory
+// can be allocated with alignment to the beginning of a page boundary.
+#define BLIS_PAGE_SIZE                   4096
+
+// The maximum prefetch byte offset is used to pad the end of any static
+// memory allocation request so that the micro-kernel can exceed the
+// bounds of the usable portion of a memory region without causing a
+// segmentation fault.
+#define BLIS_MAX_PREFETCH_BYTE_OFFSET    128
+
 
 
+// -- MIXED DATATYPE SUPPORT ---------------------------------------------------
+
+// Basic (homogeneous) datatype support is always enabled.
+
 // Enable mixed domain operations?
 //#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
 
@@ -58,4 +87,141 @@
 
 
 
+// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
+
+// -- Default cache blocksizes --
+
+// Constraints:
+//
+// (1) MC must be a multiple of:
+//     (a) MR (for zero-padding purposes) and
+//     (b) NR.
+// (2) NC must be a multiple of
+//     (a) NR (for zero-padding purposes) and
+//     (b) MR.
+// (3) KC does not need to be multiple of anything, unless the micro-kernel
+//     specifically requires it (and typically it does not).
+// 
+// NOTE: For BLIS libraries built on block-panel macro-kernels, constraint
+// (2b) is relaxed. In this case, (1b) is needed for operation implementations
+// involving matrices with diagonals (trmm, trsm). In these cases, we want
+// any panel of packed matrix A to have a diagonal offset that is a multiple
+// of MR. If, instead, the library were to be built on panel-block
+// macro-kernels, matrix B would be the one with structure, not A, and thus
+// it would be constraint (2b) that would be needed instead of (1b).
+//
+
+#define BLIS_DEFAULT_MC_S              128
+#define BLIS_DEFAULT_KC_S              256
+#define BLIS_DEFAULT_NC_S              8192
+
+#define BLIS_DEFAULT_MC_D              368
+#define BLIS_DEFAULT_KC_D              256
+#define BLIS_DEFAULT_NC_D              8192
+
+#define BLIS_DEFAULT_MC_C              128
+#define BLIS_DEFAULT_KC_C              256
+#define BLIS_DEFAULT_NC_C              8192
+
+#define BLIS_DEFAULT_MC_Z              128
+#define BLIS_DEFAULT_KC_Z              256
+#define BLIS_DEFAULT_NC_Z              8192
+
+// -- Default register blocksizes for inner kernel --
+
+// NOTE: When using the reference configuration, these register blocksizes
+// in the m and n dimensions should all be equal to the size expected by
+// the reference micro-kernel(s).
+
+#define BLIS_DEFAULT_MR_S              4
+#define BLIS_DEFAULT_NR_S              4
+
+#define BLIS_DEFAULT_MR_D              4
+#define BLIS_DEFAULT_NR_D              4
+
+#define BLIS_DEFAULT_MR_C              4
+#define BLIS_DEFAULT_NR_C              4
+
+#define BLIS_DEFAULT_MR_Z              4
+#define BLIS_DEFAULT_NR_Z              4
+
+// NOTE: If the micro-kernel, which is typically unrolled to a factor
+// of f, handles leftover edge cases (i.e., when k % f > 0), then these
+// register blocksizes in the k dimension can be defined to 1.
+
+#define BLIS_DEFAULT_KR_S              1
+#define BLIS_DEFAULT_KR_D              1
+#define BLIS_DEFAULT_KR_C              1
+#define BLIS_DEFAULT_KR_Z              1
+
+// -- Number of elements per vector register --
+
+// NOTE: These constants are typically only used to determine the amount
+// of duplication needed when configuring level-3 macro-kernels that
+// copy and duplicate elements of B to a temporary duplication buffer
+// (so that element-wise vector multiplication and addition instructions
+// can be used).
+
+#define BLIS_NUM_ELEM_PER_REG_S        4
+#define BLIS_NUM_ELEM_PER_REG_D        2
+#define BLIS_NUM_ELEM_PER_REG_C        2
+#define BLIS_NUM_ELEM_PER_REG_Z        1
+
+// -- Default switch for duplication of B --
+
+// NOTE: If BLIS_DEFAULT_DUPLICATE_B is set to FALSE, then the
+// NUM_DUPL definitions are not used.
+
+//#define BLIS_DEFAULT_DUPLICATE_B       TRUE
+#define BLIS_DEFAULT_DUPLICATE_B       FALSE
+#define BLIS_DEFAULT_NUM_DUPL_S        BLIS_NUM_ELEM_PER_REG_S
+#define BLIS_DEFAULT_NUM_DUPL_D        BLIS_NUM_ELEM_PER_REG_D
+#define BLIS_DEFAULT_NUM_DUPL_C        BLIS_NUM_ELEM_PER_REG_C
+#define BLIS_DEFAULT_NUM_DUPL_Z        BLIS_NUM_ELEM_PER_REG_Z
+
+// -- Default incremental packing blocksizes (n dimension) --
+
+// NOTE: These incremental packing blocksizes (for the n dimension) are only
+// used by certain blocked variants. But when they *are* used, they MUST
+// be an integer multiple of NR!
+
+#define BLIS_DEFAULT_NI_FAC            16
+#define BLIS_DEFAULT_NI_S              (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_S)
+#define BLIS_DEFAULT_NI_D              (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_D)
+#define BLIS_DEFAULT_NI_C              (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_C)
+#define BLIS_DEFAULT_NI_Z              (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_Z)
+
+
+
+// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
+
+// -- Default fusing factors for level-1f operations --
+
+// NOTE: Default fusing factors are not used by the reference implementations
+// of level-1f operations. They are here only for use when these operations
+// are optimized.
+
+#define BLIS_DEFAULT_FUSING_FACTOR_S   8
+#define BLIS_DEFAULT_FUSING_FACTOR_D   4
+#define BLIS_DEFAULT_FUSING_FACTOR_C   4
+#define BLIS_DEFAULT_FUSING_FACTOR_Z   2
+
+
+
+// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
+
+// -- Default register blocksizes for vectors --
+
+// NOTE: Register blocksizes for vectors are used when packing
+// non-contiguous vectors. Similar to that of KR, they can
+// typically be set to 1.
+
+#define BLIS_DEFAULT_VR_S              1
+#define BLIS_DEFAULT_VR_D              1
+#define BLIS_DEFAULT_VR_C              1
+#define BLIS_DEFAULT_VR_Z              1
+
+
+
+
 #endif
diff --git a/config/reference/bl2_arch.h b/config/reference/bl2_arch.h
deleted file mode 100644
index abc7649996..0000000000
--- a/config/reference/bl2_arch.h
+++ /dev/null
@@ -1,173 +0,0 @@
-/*
-
-   BLIS    
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2012, The University of Texas
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name of The University of Texas nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_ARCH_H
-#define BLIS_ARCH_H
-
-// -- General floating-point constants --
-
-#define BLIS_NUM_FP_TYPES              4
-#define BLIS_MAX_TYPE_SIZE             sizeof(dcomplex)
-
-
-// -- Maximum offset of an element that might be pre-loaded/prefetched --
-
-#define BLIS_MAX_PREFETCH_BYTE_OFFSET  128
-
-
-// -- Page size --
-
-#define BLIS_PAGE_SIZE                 4096
-
-
-// -- Number of elements per vector register --
-
-#define BLIS_NUM_ELEM_PER_REG_S        4
-#define BLIS_NUM_ELEM_PER_REG_D        2
-#define BLIS_NUM_ELEM_PER_REG_C        2
-#define BLIS_NUM_ELEM_PER_REG_Z        1
-
-
-// -- Default fusing factors for level-1 fused operations --
-
-#define BLIS_DEFAULT_FUSING_FACTOR_S   8
-#define BLIS_DEFAULT_FUSING_FACTOR_D   4
-#define BLIS_DEFAULT_FUSING_FACTOR_C   4
-#define BLIS_DEFAULT_FUSING_FACTOR_Z   2
-
-
-// -- Default cache blocksizes --
-
-// Constraints:
-//
-// (1) MC must be a multiple of:
-//     (a) MR (for zero-padding purposes) and
-//     (b) NR.
-// (2) NC must be a multiple of
-//     (a) NR (for zero-padding purposes) and
-//     (b) MR.
-// (3) KC does not need to be multiple of anything, unless the micro-kernel
-//     specifically requires it (and typically it does not).
-// 
-// NOTE: For BLIS libraries built on block-panel macro-kernels, constraint
-// (2b) is relaxed. In this case, (1b) is needed for operation implementations
-// involving matrices with diagonals (trmm, trsm). In these cases, we want the
-// diagonal offset of any panel of packed matrix A to have a diagonal offset
-// that is a multiple of MR. If, instead, the library were to be built on
-// block-panel macro-kernels, matrix B would be the one with structure, not A,
-// and thus it would be constraint (2b) that would be needed instead of (1b).
-//
-
-#define BLIS_DEFAULT_MC_S              128
-#define BLIS_DEFAULT_KC_S              256
-#define BLIS_DEFAULT_NC_S              8192
-
-#define BLIS_DEFAULT_MC_D              128
-#define BLIS_DEFAULT_KC_D              256
-#define BLIS_DEFAULT_NC_D              8192
-
-#define BLIS_DEFAULT_MC_C              128
-#define BLIS_DEFAULT_KC_C              256
-#define BLIS_DEFAULT_NC_C              8192
-
-#define BLIS_DEFAULT_MC_Z              128
-#define BLIS_DEFAULT_KC_Z              256
-#define BLIS_DEFAULT_NC_Z              8192
-
-
-// -- Default register blocksizes for inner kernel --
-
-#define BLIS_DEFAULT_MR_S              8
-#define BLIS_DEFAULT_NR_S              2
-
-#define BLIS_DEFAULT_MR_D              4
-#define BLIS_DEFAULT_NR_D              4
-
-#define BLIS_DEFAULT_MR_C              4
-#define BLIS_DEFAULT_NR_C              1
-
-#define BLIS_DEFAULT_MR_Z              2
-#define BLIS_DEFAULT_NR_Z              1
-
-// NOTE: If the micro-kernel, which is typically unrolled to a factor
-// of f, handles leftover edge cases (ie: when k % f > 0) then these
-// register blocksizes in the k dimension can be defined to 1.
-
-#define BLIS_DEFAULT_KR_S              1
-#define BLIS_DEFAULT_KR_D              1
-#define BLIS_DEFAULT_KR_C              1
-#define BLIS_DEFAULT_KR_Z              1
-
-
-// -- Default switch for duplication of B --
-
-// NOTE: If BLIS_DEFAULT_DUPLICATE_B is set to FALSE, then the
-// NUM_DUPL definitions are not used.
-
-//#define BLIS_DEFAULT_DUPLICATE_B       TRUE
-#define BLIS_DEFAULT_DUPLICATE_B       FALSE
-#define BLIS_DEFAULT_NUM_DUPL_S        BLIS_NUM_ELEM_PER_REG_S
-#define BLIS_DEFAULT_NUM_DUPL_D        BLIS_NUM_ELEM_PER_REG_D
-#define BLIS_DEFAULT_NUM_DUPL_C        BLIS_NUM_ELEM_PER_REG_C
-#define BLIS_DEFAULT_NUM_DUPL_Z        BLIS_NUM_ELEM_PER_REG_Z
-
-
-// -- Default incremental packing blocksizes (n dimension) --
-
-// NOTE: These incremental packing blocksizes (for the n dimension) are only
-// used by certain blocked variants. But when the *are* used, they MUST be
-// be an integer multiple of NR!
-
-#define BLIS_DEFAULT_NI_FAC            16
-#define BLIS_DEFAULT_NI_S              (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_S)
-#define BLIS_DEFAULT_NI_D              (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_D)
-#define BLIS_DEFAULT_NI_C              (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_C)
-#define BLIS_DEFAULT_NI_Z              (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_Z)
-
-
-// -- Default register blocksizes for vectors --
-
-// NOTE: Register blocksizes for vectors are used when packing
-// non-contiguous vectors. Similar to that of KR, they can
-// typically be set to 1.
-
-#define BLIS_DEFAULT_VR_S              1
-#define BLIS_DEFAULT_VR_D              1
-#define BLIS_DEFAULT_VR_C              1
-#define BLIS_DEFAULT_VR_Z              1
-
-
-
-
-#endif
diff --git a/config/reference/bl2_config.h b/config/reference/bl2_config.h
index 417d4dce5e..6b84306027 100644
--- a/config/reference/bl2_config.h
+++ b/config/reference/bl2_config.h
@@ -35,21 +35,50 @@
 #ifndef BLIS_CONFIG_H
 #define BLIS_CONFIG_H
 
+
+// -- OPERATING SYSTEM ---------------------------------------------------------
+
 // Declaration for posix_memalign() needs this.
 #define _GNU_SOURCE 1
 
+
+
+// -- FLOATING-POINT PROPERTIES ------------------------------------------------
+
+#define BLIS_NUM_FP_TYPES                4
+#define BLIS_MAX_TYPE_SIZE               sizeof(dcomplex)
+
+
+
+// -- MEMORY ALLOCATOR ---------------------------------------------------------
+
+// Static memory pool size.
+#define BLIS_STATIC_MEM_POOL_SIZE        (256 * 5000 * sizeof(double))
+
 // Enable memory alignment?
-#define BLIS_ENABLE_MEMORY_ALIGNMENT 1
+#define BLIS_ENABLE_MEMORY_ALIGNMENT     1
 
-// Memory alignment boundary.
+// If memory alignment is enabled, set the alignment boundary.
 #ifndef BLIS_MEMORY_ALIGNMENT_BOUNDARY
   #define BLIS_MEMORY_ALIGNMENT_BOUNDARY 16
 #endif
 
-// Static memory pool size.
-#define BLIS_STATIC_MEM_POOL_SIZE (256 * 5000 * sizeof(double))
+// The page size is used by the memory allocator so that static memory
+// can be allocated with alignment to the beginning of a page boundary.
+#define BLIS_PAGE_SIZE                   4096
+
+// The maximum prefetch byte offset is used to pad the end of any static
+// memory allocation request so that the micro-kernel can exceed the
+// bounds of the usable portion of a memory region without causing a
+// segmentation fault.
+#define BLIS_MAX_PREFETCH_BYTE_OFFSET    128
+
 
 
+// -- MIXED DATATYPE SUPPORT ---------------------------------------------------
+
+// Basic (homogeneous) datatype support is always enabled.
+
 // Enable mixed domain operations?
 //#define BLIS_ENABLE_MIXED_DOMAIN_SUPPORT
 
@@ -58,4 +87,141 @@
 
 
 
+// -- LEVEL-3 MICRO-KERNEL CONSTANTS -------------------------------------------
+
+// -- Default cache blocksizes --
+
+// Constraints:
+//
+// (1) MC must be a multiple of:
+//     (a) MR (for zero-padding purposes) and
+//     (b) NR.
+// (2) NC must be a multiple of
+//     (a) NR (for zero-padding purposes) and
+//     (b) MR.
+// (3) KC does not need to be multiple of anything, unless the micro-kernel
+//     specifically requires it (and typically it does not).
+// 
+// NOTE: For BLIS libraries built on block-panel macro-kernels, constraint
+// (2b) is relaxed. In this case, (1b) is needed for operation implementations
+// involving matrices with diagonals (trmm, trsm). In these cases, we want
+// any panel of packed matrix A to have a diagonal offset that is a multiple
+// of MR. If, instead, the library were to be built on panel-block
+// macro-kernels, matrix B would be the one with structure, not A, and
+// thus it would be constraint (2b) that would be needed instead of (1b).
+//
+
+#define BLIS_DEFAULT_MC_S              128
+#define BLIS_DEFAULT_KC_S              256
+#define BLIS_DEFAULT_NC_S              8192
+
+#define BLIS_DEFAULT_MC_D              128
+#define BLIS_DEFAULT_KC_D              256
+#define BLIS_DEFAULT_NC_D              8192
+
+#define BLIS_DEFAULT_MC_C              128
+#define BLIS_DEFAULT_KC_C              256
+#define BLIS_DEFAULT_NC_C              8192
+
+#define BLIS_DEFAULT_MC_Z              128
+#define BLIS_DEFAULT_KC_Z              256
+#define BLIS_DEFAULT_NC_Z              8192
+
+// -- Default register blocksizes for inner kernel --
+
+// NOTE: When using the reference configuration, these register blocksizes
+// in the m and n dimensions should all be equal to the size expected by
+// the reference micro-kernel(s).
+
+#define BLIS_DEFAULT_MR_S              4
+#define BLIS_DEFAULT_NR_S              4
+
+#define BLIS_DEFAULT_MR_D              4
+#define BLIS_DEFAULT_NR_D              4
+
+#define BLIS_DEFAULT_MR_C              4
+#define BLIS_DEFAULT_NR_C              4
+
+#define BLIS_DEFAULT_MR_Z              4
+#define BLIS_DEFAULT_NR_Z              4
+
+// NOTE: If the micro-kernel, which is typically unrolled to a factor
+// of f, handles leftover edge cases (i.e., when k % f > 0), then these
+// register blocksizes in the k dimension can be defined to 1.
+
+#define BLIS_DEFAULT_KR_S              1
+#define BLIS_DEFAULT_KR_D              1
+#define BLIS_DEFAULT_KR_C              1
+#define BLIS_DEFAULT_KR_Z              1
+
+// -- Number of elements per vector register --
+
+// NOTE: These constants are typically only used to determine the amount
+// of duplication needed when configuring level-3 macro-kernels that
+// copy and duplicate elements of B to a temporary duplication buffer
+// (so that element-wise vector multiplication and addition instructions
+// can be used).
+
+#define BLIS_NUM_ELEM_PER_REG_S        4
+#define BLIS_NUM_ELEM_PER_REG_D        2
+#define BLIS_NUM_ELEM_PER_REG_C        2
+#define BLIS_NUM_ELEM_PER_REG_Z        1
+
+// -- Default switch for duplication of B --
+
+// NOTE: If BLIS_DEFAULT_DUPLICATE_B is set to FALSE, then the
+// NUM_DUPL definitions are not used.
+
+//#define BLIS_DEFAULT_DUPLICATE_B       TRUE
+#define BLIS_DEFAULT_DUPLICATE_B       FALSE
+#define BLIS_DEFAULT_NUM_DUPL_S        BLIS_NUM_ELEM_PER_REG_S
+#define BLIS_DEFAULT_NUM_DUPL_D        BLIS_NUM_ELEM_PER_REG_D
+#define BLIS_DEFAULT_NUM_DUPL_C        BLIS_NUM_ELEM_PER_REG_C
+#define BLIS_DEFAULT_NUM_DUPL_Z        BLIS_NUM_ELEM_PER_REG_Z
+
+// -- Default incremental packing blocksizes (n dimension) --
+
+// NOTE: These incremental packing blocksizes (for the n dimension) are only
+// used by certain blocked variants. But when they *are* used, they MUST be
+// an integer multiple of NR!
+
+#define BLIS_DEFAULT_NI_FAC            16
+#define BLIS_DEFAULT_NI_S              (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_S)
+#define BLIS_DEFAULT_NI_D              (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_D)
+#define BLIS_DEFAULT_NI_C              (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_C)
+#define BLIS_DEFAULT_NI_Z              (BLIS_DEFAULT_NI_FAC * BLIS_DEFAULT_NR_Z)
+
+
+
+// -- LEVEL-1F KERNEL CONSTANTS ------------------------------------------------
+
+// -- Default fusing factors for level-1f operations --
+
+// NOTE: Default fusing factors are not used by the reference implementations
+// of level-1f operations. They are here only for use when these operations
+// are optimized.
+
+#define BLIS_DEFAULT_FUSING_FACTOR_S   8
+#define BLIS_DEFAULT_FUSING_FACTOR_D   4
+#define BLIS_DEFAULT_FUSING_FACTOR_C   4
+#define BLIS_DEFAULT_FUSING_FACTOR_Z   2
+
+
+
+// -- LEVEL-1V KERNEL CONSTANTS ------------------------------------------------
+
+// -- Default register blocksizes for vectors --
+
+// NOTE: Register blocksizes for vectors are used when packing
+// non-contiguous vectors. Similar to that of KR, they can
+// typically be set to 1.
+
+#define BLIS_DEFAULT_VR_S              1
+#define BLIS_DEFAULT_VR_D              1
+#define BLIS_DEFAULT_VR_C              1
+#define BLIS_DEFAULT_VR_Z              1
+
+
+
+
 #endif
diff --git a/frame/include/blis2.h b/frame/include/blis2.h
index e1475a6158..896f7bc697 100644
--- a/frame/include/blis2.h
+++ b/frame/include/blis2.h
@@ -46,13 +46,11 @@ extern "C" {
 
 // -- BLIS configuration definition --
 
-// NOTE: These definitions are placed here mainly because there might be
-// something in bl2_config.h that is needed within one of the system
-// headers. A good example: posix_memalign() needs _GNU_SOURCE on GNU
-// systems (I think).
+// NOTE: We include bl2_config.h here because there might be something
+// defined there that is needed within one of the system headers. A good
+// example: posix_memalign() needs _GNU_SOURCE on GNU systems (I think).
 
 #include "bl2_config.h"
-#include "bl2_arch.h"
 
 
 // -- System headers --
@@ -74,7 +72,7 @@ extern "C" {
 #include "bl2_extern_defs.h"
 
 
-// -- BLIS architecture / kernel definitions --
+// -- BLIS kernel definitions --
 
 #include "bl2_kernel.h"
 
diff --git a/version b/version
index 8a8a4eb62b..f081046ced 100644
--- a/version
+++ b/version
@@ -1 +1 @@
-0.0.1-2
+0.0.1-3