diff --git a/LICENSE.txt b/LICENSE.txt index 419a72662b..aa211ad360 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -946,3 +946,84 @@ Some portions of this module are derived from code from FastHash OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- + +be/src/thirdparty/mpfit + +MPFIT: A MINPACK-1 Least Squares Fitting Library in C + +Original public domain version by B. Garbow, K. Hillstrom, J. More' + (Argonne National Laboratory, MINPACK project, March 1980) + Copyright (1999) University of Chicago + (see below) + +Tranlation to C Language by S. Moshier (moshier.net) + (no restrictions placed on distribution) + +Enhancements and packaging by C. Markwardt + (comparable to IDL fitting routine MPFIT + see http://cow.physics.wisc.edu/~craigm/idl/idl.html) + Copyright (C) 2003, 2004, 2006, 2007 Craig B. Markwardt + + This software is provided as is without any warranty whatsoever. + Permission to use, copy, modify, and distribute modified or + unmodified copies is granted, provided this copyright and disclaimer + are included unchanged. + + +Source code derived from MINPACK must have the following disclaimer +text provided. + +=========================================================================== +Minpack Copyright Notice (1999) University of Chicago. All rights reserved + +Redistribution and use in source and binary forms, with or +without modification, are permitted provided that the +following conditions are met: + +1. Redistributions of source code must retain the above +copyright notice, this list of conditions and the following +disclaimer. + +2. Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following +disclaimer in the documentation and/or other materials +provided with the distribution. + +3. The end-user documentation included with the +redistribution, if any, must include the following +acknowledgment: + + "This product includes software developed by the + University of Chicago, as Operator of Argonne National + Laboratory. + +Alternately, this acknowledgment may appear in the software +itself, if and wherever such third-party acknowledgments +normally appear. + +4. WARRANTY DISCLAIMER. THE SOFTWARE IS SUPPLIED "AS IS" +WITHOUT WARRANTY OF ANY KIND. THE COPYRIGHT HOLDER, THE +UNITED STATES, THE UNITED STATES DEPARTMENT OF ENERGY, AND +THEIR EMPLOYEES: (1) DISCLAIM ANY WARRANTIES, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO ANY IMPLIED WARRANTIES +OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE +OR NON-INFRINGEMENT, (2) DO NOT ASSUME ANY LEGAL LIABILITY +OR RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR +USEFULNESS OF THE SOFTWARE, (3) DO NOT REPRESENT THAT USE OF +THE SOFTWARE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS, (4) +DO NOT WARRANT THAT THE SOFTWARE WILL FUNCTION +UNINTERRUPTED, THAT IT IS ERROR-FREE OR THAT ANY ERRORS WILL +BE CORRECTED. + +5. LIMITATION OF LIABILITY. IN NO EVENT WILL THE COPYRIGHT +HOLDER, THE UNITED STATES, THE UNITED STATES DEPARTMENT OF +ENERGY, OR THEIR EMPLOYEES: BE LIABLE FOR ANY INDIRECT, +INCIDENTAL, CONSEQUENTIAL, SPECIAL OR PUNITIVE DAMAGES OF +ANY KIND OR NATURE, INCLUDING BUT NOT LIMITED TO LOSS OF +PROFITS OR LOSS OF DATA, FOR ANY REASON WHATSOEVER, WHETHER +SUCH LIABILITY IS ASSERTED ON THE BASIS OF CONTRACT, TORT +(INCLUDING NEGLIGENCE OR STRICT LIABILITY), OR OTHERWISE, +EVEN IF ANY OF SAID PARTIES HAS BEEN WARNED OF THE +POSSIBILITY OF SUCH LOSS OR DAMAGES. 
+ +-------------------------------------------------------------------------------- diff --git a/NOTICE.txt b/NOTICE.txt index 3760589754..c2875a2fd4 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -13,3 +13,7 @@ Project for use in the OpenSSL Toolkit (http://www.openssl.org/) This product includes cryptographic software written by Eric Young (eay@cryptsoft.com). This product includes software written by Tim Hudson (tjh@cryptsoft.com). + +This product includes software developed by the University of Chicago, +as Operator of Argonne National Laboratory. +Copyright (C) 1999 University of Chicago. All rights reserved. \ No newline at end of file diff --git a/be/src/exprs/aggregate-functions-ir.cc b/be/src/exprs/aggregate-functions-ir.cc index d5f946a773..499188c6ba 100644 --- a/be/src/exprs/aggregate-functions-ir.cc +++ b/be/src/exprs/aggregate-functions-ir.cc @@ -17,24 +17,25 @@ #include "exprs/aggregate-functions.h" -#include #include #include #include #include +#include #include #include #include "codegen/impala-ir.h" #include "common/logging.h" +#include "exprs/anyval-util.h" +#include "exprs/hll-bias.h" #include "runtime/decimal-value.inline.h" #include "runtime/runtime-state.h" #include "runtime/string-value.inline.h" #include "runtime/timestamp-value.h" #include "runtime/timestamp-value.inline.h" -#include "exprs/anyval-util.h" -#include "exprs/hll-bias.h" +#include "util/mpfit-util.h" #include "common/names.h" @@ -42,6 +43,7 @@ using boost::uniform_int; using boost::ranlux64_3; using std::make_pair; using std::map; +using std::min_element; using std::nth_element; using std::pop_heap; using std::push_heap; @@ -1480,6 +1482,186 @@ BigIntVal AggregateFunctions::HllFinalize(FunctionContext* ctx, const StringVal& return estimate; } +/// Intermediate aggregation state for the SampledNdv() function. +/// Stores NUM_HLL_BUCKETS of the form . +/// The 'row_count' keeps track of how many input rows were aggregated into that +/// bucket, and the 'hll_state' is an intermediate aggregation state of HyperLogLog. +/// See the header comments on the SampledNdv() function for more details. +class SampledNdvState { + public: + /// Empirically determined number of HLL buckets. Power of two for fast modulo. + static const uint32_t NUM_HLL_BUCKETS = 32; + + /// A bucket contains an update count and an HLL intermediate state. + static constexpr int64_t BUCKET_SIZE = sizeof(int64_t) + AggregateFunctions::HLL_LEN; + + /// Sampling percent which was given as the second argument to SampledNdv(). + /// Stored here to avoid existing issues with passing constant arguments to all + /// aggregation phases and because we convert the sampling percent argument from + /// decimal to double. See IMPALA-6179. + double sample_perc; + + /// Counts the number of Update() calls. Used for determining which bucket to update. + int64_t total_row_count; + + /// Array of buckets. + struct { + int64_t row_count; + uint8_t hll[AggregateFunctions::HLL_LEN]; + } buckets[NUM_HLL_BUCKETS]; +}; + +void AggregateFunctions::SampledNdvInit(FunctionContext* ctx, StringVal* dst) { + // Uses a preallocated FIXED_UDA_INTERMEDIATE intermediate value. + DCHECK_EQ(dst->len, sizeof(SampledNdvState)); + memset(dst->ptr, 0, sizeof(SampledNdvState)); + + DoubleVal* sample_perc = reinterpret_cast(ctx->GetConstantArg(1)); + if (sample_perc == nullptr) return; + // Guaranteed by the FE. 
+ DCHECK(!sample_perc->is_null); + DCHECK_GE(sample_perc->val, 0.0); + DCHECK_LE(sample_perc->val, 1.0); + SampledNdvState* state = reinterpret_cast(dst->ptr); + state->sample_perc = sample_perc->val; +} + +/// Incorporate the 'src' into one of the intermediate HLLs, which will be used by +/// Finalize() to generate a set of the (x,y) data points. +template +void AggregateFunctions::SampledNdvUpdate(FunctionContext* ctx, const T& src, + const DoubleVal& sample_perc, StringVal* dst) { + SampledNdvState* state = reinterpret_cast(dst->ptr); + int64_t bucket_idx = state->total_row_count % SampledNdvState::NUM_HLL_BUCKETS; + StringVal hll_dst = StringVal(state->buckets[bucket_idx].hll, HLL_LEN); + HllUpdate(ctx, src, &hll_dst); + ++state->buckets[bucket_idx].row_count; + ++state->total_row_count; +} + +void AggregateFunctions::SampledNdvMerge(FunctionContext* ctx, const StringVal& src, + StringVal* dst) { + SampledNdvState* src_state = reinterpret_cast(src.ptr); + SampledNdvState* dst_state = reinterpret_cast(dst->ptr); + for (int i = 0; i < SampledNdvState::NUM_HLL_BUCKETS; ++i) { + StringVal src_hll = StringVal(src_state->buckets[i].hll, HLL_LEN); + StringVal dst_hll = StringVal(dst_state->buckets[i].hll, HLL_LEN); + HllMerge(ctx, src_hll, &dst_hll); + dst_state->buckets[i].row_count += src_state->buckets[i].row_count; + } + // Total count. Not really needed after Update() but kept for sanity checking. + dst_state->total_row_count += src_state->total_row_count; + // Propagate sampling percent to Finalize(). + dst_state->sample_perc = src_state->sample_perc; +} + +BigIntVal AggregateFunctions::SampledNdvFinalize(FunctionContext* ctx, + const StringVal& src) { + SampledNdvState* state = reinterpret_cast(src.ptr); + + // Generate 'num_points' data points with x=row_count and y=ndv_estimate. These points + // are used to fit a function for the NDV growth and estimate the real NDV. + constexpr int num_points = + SampledNdvState::NUM_HLL_BUCKETS * SampledNdvState::NUM_HLL_BUCKETS; + int64_t counts[num_points] = { 0 }; + int64_t ndvs[num_points] = { 0 }; + + int64_t min_ndv = numeric_limits::max(); + int64_t min_count = numeric_limits::max(); + // We have a fixed number of HLL intermediates to generate data points. Any unique + // subset of intermediates can be combined to create a new data point. It was + // empirically determined that 'num_data' points is typically sufficient and there are + // diminishing returns from generating additional data points. + // The generation method below was chosen for its simplicity. It successively merges + // buckets in a rolling window of size NUM_HLL_BUCKETS. Repeating the last data point + // where all buckets are merged biases the curve fitting to hit that data point which + // makes sense because that's likely the most accurate one. The number of data points + // are sufficient for reasonable accuracy. 
+ int pidx = 0; + for (int i = 0; i < SampledNdvState::NUM_HLL_BUCKETS; ++i) { + uint8_t merged_hll_data[HLL_LEN]; + memset(merged_hll_data, 0, HLL_LEN); + StringVal merged_hll(merged_hll_data, HLL_LEN); + int64_t merged_count = 0; + for (int j = 0; j < SampledNdvState::NUM_HLL_BUCKETS; ++j) { + int bucket_idx = (i + j) % SampledNdvState::NUM_HLL_BUCKETS; + merged_count += state->buckets[bucket_idx].row_count; + counts[pidx] = merged_count; + StringVal hll = StringVal(state->buckets[bucket_idx].hll, HLL_LEN); + HllMerge(ctx, hll, &merged_hll); + ndvs[pidx] = HllFinalEstimate(merged_hll.ptr, HLL_LEN); + ++pidx; + } + min_count = std::min(min_count, state->buckets[i].row_count); + min_ndv = std::min(min_ndv, ndvs[i * SampledNdvState::NUM_HLL_BUCKETS]); + } + // Based on the point-generation method above the last elements represent the data + // point where all buckets are merged. + int64_t max_count = counts[num_points - 1]; + int64_t max_ndv = ndvs[num_points - 1]; + + // Scale all values to [0,1] since some objective functions require it (e.g., Sigmoid). + double count_scale = max_count - min_count; + double ndv_scale = max_ndv - min_ndv; + if (count_scale == 0) count_scale = 1.0; + if (ndv_scale == 0) ndv_scale = 1.0; + double scaled_counts[num_points]; + double scaled_ndvs[num_points]; + for (int i = 0; i < num_points; ++i) { + scaled_counts[i] = counts[i] / count_scale; + scaled_ndvs[i] = ndvs[i] / ndv_scale; + } + + // List of objective functions. Curve fitting will select the best values for the + // parameters a, b, c, d. + vector ndv_fns; + // Linear function: f(x) = a + b * x + ndv_fns.push_back(ObjectiveFunction("LIN", 2, + [](double x, const double* params) -> double { + return params[0] + params[1] * x; + } + )); + // Logarithmic function: f(x) = a + b * log(x) + ndv_fns.push_back(ObjectiveFunction("LOG", 2, + [](double x, const double* params) -> double { + return params[0] + params[1] * log(x); + } + )); + // Power function: f(x) = a + b * pow(x, c) + ndv_fns.push_back(ObjectiveFunction("POW", 3, + [](double x, const double* params) -> double { + return params[0] + params[1] * pow(x, params[2]); + } + )); + // Sigmoid function: f(x) = a + b * (c / (c + pow(d, -x))) + ndv_fns.push_back(ObjectiveFunction("SIG", 4, + [](double x, const double* params) -> double { + return params[0] + params[1] * (params[2] / (params[2] + pow(params[3], -x))); + } + )); + + // Perform least mean squares fitting on all objective functions. + vector valid_ndv_fns; + for (ObjectiveFunction& f: ndv_fns) { + if(f.LmsFit(scaled_counts, scaled_ndvs, num_points)) { + valid_ndv_fns.push_back(std::move(f)); + } + } + + // Select the best-fit function for estimating the NDV. + auto best_fit_fn = min_element(valid_ndv_fns.begin(), valid_ndv_fns.end(), + [](const ObjectiveFunction& a, const ObjectiveFunction& b) -> bool { + return a.GetError() < b.GetError(); + } + ); + + // Compute the extrapolated NDV based on the extrapolated row count. + double extrap_count = max_count / state->sample_perc; + double scaled_extrap_count = extrap_count / count_scale; + double scaled_extrap_ndv = best_fit_fn->GetY(scaled_extrap_count); + return round(scaled_extrap_ndv * ndv_scale); +} + // An implementation of a simple single pass variance algorithm. A standard UDA must // be single pass (i.e. does not scan the table more than once), so the most canonical // two pass approach is not practical. 
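
The finalize step above generates (row count, NDV estimate) data points from merged
HLL buckets, scales them, fits each candidate objective function, and extrapolates the
best fit to the full row count. A minimal standalone C++ sketch of that extrapolation
idea, using a closed-form least-squares fit of the linear model f(x) = a + b*x; the
data points and sampling percent below are made up for illustration, and this is not
the Impala code path, which fits all candidate functions via MPFIT:

  #include <cstdio>
  #include <vector>

  int main() {
    // Hypothetical scaled (row count, NDV estimate) points, analogous to
    // scaled_counts/scaled_ndvs in SampledNdvFinalize().
    std::vector<double> xs = {0.25, 0.50, 0.75, 1.00};
    std::vector<double> ys = {0.40, 0.65, 0.85, 1.00};
    const int n = static_cast<int>(xs.size());

    // Ordinary least squares for the linear objective f(x) = a + b*x.
    double sx = 0, sy = 0, sxx = 0, sxy = 0;
    for (int i = 0; i < n; ++i) {
      sx += xs[i]; sy += ys[i]; sxx += xs[i] * xs[i]; sxy += xs[i] * ys[i];
    }
    double b = (n * sxy - sx * sy) / (n * sxx - sx * sx);
    double a = (sy - b * sx) / n;

    // Extrapolate: with a hypothetical 10% sample, the full row count is
    // 1.0 / 0.1 in scaled units.
    double scaled_extrap_count = 1.0 / 0.1;
    std::printf("a=%.3f b=%.3f extrapolated scaled NDV=%.3f\n",
        a, b, a + b * scaled_extrap_count);
    return 0;
  }
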
@@ -2140,6 +2322,27 @@ template void AggregateFunctions::HllUpdate( template void AggregateFunctions::HllUpdate( FunctionContext*, const DecimalVal&, StringVal*); +template void AggregateFunctions::SampledNdvUpdate( + FunctionContext*, const BooleanVal&, const DoubleVal&, StringVal*); +template void AggregateFunctions::SampledNdvUpdate( + FunctionContext*, const TinyIntVal&, const DoubleVal&, StringVal*); +template void AggregateFunctions::SampledNdvUpdate( + FunctionContext*, const SmallIntVal&, const DoubleVal&, StringVal*); +template void AggregateFunctions::SampledNdvUpdate( + FunctionContext*, const IntVal&, const DoubleVal&, StringVal*); +template void AggregateFunctions::SampledNdvUpdate( + FunctionContext*, const BigIntVal&, const DoubleVal&, StringVal*); +template void AggregateFunctions::SampledNdvUpdate( + FunctionContext*, const FloatVal&, const DoubleVal&, StringVal*); +template void AggregateFunctions::SampledNdvUpdate( + FunctionContext*, const DoubleVal&, const DoubleVal&, StringVal*); +template void AggregateFunctions::SampledNdvUpdate( + FunctionContext*, const StringVal&, const DoubleVal&, StringVal*); +template void AggregateFunctions::SampledNdvUpdate( + FunctionContext*, const TimestampVal&, const DoubleVal&, StringVal*); +template void AggregateFunctions::SampledNdvUpdate( + FunctionContext*, const DecimalVal&, const DoubleVal&, StringVal*); + template void AggregateFunctions::KnuthVarUpdate( FunctionContext*, const TinyIntVal&, StringVal*); template void AggregateFunctions::KnuthVarUpdate( diff --git a/be/src/exprs/aggregate-functions.h b/be/src/exprs/aggregate-functions.h index 5ebcb97323..81884c1aa3 100644 --- a/be/src/exprs/aggregate-functions.h +++ b/be/src/exprs/aggregate-functions.h @@ -204,6 +204,23 @@ class AggregateFunctions { /// estimates. static uint64_t HllFinalEstimate(const uint8_t* buckets, int32_t num_buckets); + /// Estimates the number of distinct values (NDV) based on a sample of data and the + /// corresponding sampling rate. The main idea of this function is to collect several + /// (x,y) data points where x is the number of rows and y is the corresponding NDV + /// estimate. These data points are used to fit an objective function to the data such + /// that the true NDV can be extrapolated. + /// This aggregate function maintains a fixed number of HyperLogLog intermediates. + /// The Update() phase updates the intermediates in a round-robin fashion. + /// The Merge() phase combines the corresponding intermediates. + /// The Finalize() phase generates (x,y) data points, performs curve fitting, and + /// computes the estimated true NDV. + static void SampledNdvInit(FunctionContext*, StringVal* dst); + template + static void SampledNdvUpdate(FunctionContext*, const T& src, + const DoubleVal& sample_perc, StringVal* dst); + static void SampledNdvMerge(FunctionContext*, const StringVal& src, StringVal* dst); + static BigIntVal SampledNdvFinalize(FunctionContext*, const StringVal& src); + /// Knuth's variance algorithm, more numerically stable than canonical stddev /// algorithms; reference implementation: /// http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online_algorithm diff --git a/be/src/thirdparty/mpfit/DISCLAIMER b/be/src/thirdparty/mpfit/DISCLAIMER new file mode 100644 index 0000000000..3e1b76fcc9 --- /dev/null +++ b/be/src/thirdparty/mpfit/DISCLAIMER @@ -0,0 +1,77 @@ + +MPFIT: A MINPACK-1 Least Squares Fitting Library in C + +Original public domain version by B. Garbow, K. Hillstrom, J. 
More' + (Argonne National Laboratory, MINPACK project, March 1980) + Copyright (1999) University of Chicago + (see below) + +Tranlation to C Language by S. Moshier (moshier.net) + (no restrictions placed on distribution) + +Enhancements and packaging by C. Markwardt + (comparable to IDL fitting routine MPFIT + see http://cow.physics.wisc.edu/~craigm/idl/idl.html) + Copyright (C) 2003, 2004, 2006, 2007 Craig B. Markwardt + + This software is provided as is without any warranty whatsoever. + Permission to use, copy, modify, and distribute modified or + unmodified copies is granted, provided this copyright and disclaimer + are included unchanged. + + +Source code derived from MINPACK must have the following disclaimer +text provided. + +=========================================================================== +Minpack Copyright Notice (1999) University of Chicago. All rights reserved + +Redistribution and use in source and binary forms, with or +without modification, are permitted provided that the +following conditions are met: + +1. Redistributions of source code must retain the above +copyright notice, this list of conditions and the following +disclaimer. + +2. Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following +disclaimer in the documentation and/or other materials +provided with the distribution. + +3. The end-user documentation included with the +redistribution, if any, must include the following +acknowledgment: + + "This product includes software developed by the + University of Chicago, as Operator of Argonne National + Laboratory. + +Alternately, this acknowledgment may appear in the software +itself, if and wherever such third-party acknowledgments +normally appear. + +4. WARRANTY DISCLAIMER. THE SOFTWARE IS SUPPLIED "AS IS" +WITHOUT WARRANTY OF ANY KIND. THE COPYRIGHT HOLDER, THE +UNITED STATES, THE UNITED STATES DEPARTMENT OF ENERGY, AND +THEIR EMPLOYEES: (1) DISCLAIM ANY WARRANTIES, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO ANY IMPLIED WARRANTIES +OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE +OR NON-INFRINGEMENT, (2) DO NOT ASSUME ANY LEGAL LIABILITY +OR RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR +USEFULNESS OF THE SOFTWARE, (3) DO NOT REPRESENT THAT USE OF +THE SOFTWARE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS, (4) +DO NOT WARRANT THAT THE SOFTWARE WILL FUNCTION +UNINTERRUPTED, THAT IT IS ERROR-FREE OR THAT ANY ERRORS WILL +BE CORRECTED. + +5. LIMITATION OF LIABILITY. IN NO EVENT WILL THE COPYRIGHT +HOLDER, THE UNITED STATES, THE UNITED STATES DEPARTMENT OF +ENERGY, OR THEIR EMPLOYEES: BE LIABLE FOR ANY INDIRECT, +INCIDENTAL, CONSEQUENTIAL, SPECIAL OR PUNITIVE DAMAGES OF +ANY KIND OR NATURE, INCLUDING BUT NOT LIMITED TO LOSS OF +PROFITS OR LOSS OF DATA, FOR ANY REASON WHATSOEVER, WHETHER +SUCH LIABILITY IS ASSERTED ON THE BASIS OF CONTRACT, TORT +(INCLUDING NEGLIGENCE OR STRICT LIABILITY), OR OTHERWISE, +EVEN IF ANY OF SAID PARTIES HAS BEEN WARNED OF THE +POSSIBILITY OF SUCH LOSS OR DAMAGES. diff --git a/be/src/thirdparty/mpfit/README b/be/src/thirdparty/mpfit/README new file mode 100644 index 0000000000..29a9cef461 --- /dev/null +++ b/be/src/thirdparty/mpfit/README @@ -0,0 +1,705 @@ + +MPFIT: A MINPACK-1 Least Squares Fitting Library in C + +Original public domain version by B. Garbow, K. Hillstrom, J. More' + (Argonne National Laboratory, MINPACK project, March 1980) + +Tranlation to C Language by S. Moshier (moshier.net) + +Enhancements, documentation and packaging by C. 
Markwardt + (comparable to IDL fitting routine MPFIT + see http://cow.physics.wisc.edu/~craigm/idl/idl.html) + Copyright (C) 2003, 2004, 2006, 2007, 2009, 2010, 2013 Craig B. Markwardt + + +SUMMARY of CHANGES + 16 Feb 2009 - version 1.0 - initial release + 18 Feb 2009 - version 1.1 - add 'version' field to 'results' struct + 22 Nov 2009 - - allow to compile with C++ compiler + - change to memset() instead of nonstandard bzero() + - for Microsoft, proprietary equivalent of finite() + 04 Oct 2010 - - add static declarations, remove some compiler warnings + (reported by Lars Kr. Lundin) + 13 Nov 2010 - version 1.2 - additional documentation, cleanup of mpfit.h + 23 Apr 2013 - version 1.3 - add MP_NO_ITER; bug fix mpside=2 when debugging + (thanks to M. Wojdyr) + +$Id: README,v 1.18 2016/06/02 19:14:16 craigm Exp $ + +INTRODUCTION + +MPFIT uses the Levenberg-Marquardt technique to solve the +least-squares problem. In its typical use, MPFIT will be used to +fit a user-supplied function (the "model") to user-supplied data +points (the "data") by adjusting a set of parameters. MPFIT is +based upon MINPACK-1 (LMDIF.F) by More' and collaborators. + +For example, a researcher may think that a set of observed data +points is best modelled with a Gaussian curve. A Gaussian curve is +parameterized by its mean, standard deviation and normalization. +MPFIT will, within certain constraints, find the set of parameters +which best fits the data. The fit is "best" in the least-squares +sense; that is, the sum of the weighted squared differences between +the model and data is minimized. + +The Levenberg-Marquardt technique is a particular strategy for +iteratively searching for the best fit. This particular +implementation is drawn from a robust routine called MINPACK-1 (see +NETLIB). This version allows upper and lower bounding constraints +to be placed on each parameter, or the parameter can be held fixed. + +The user-supplied function should compute an array of weighted +deviations between model and data. In a typical scientific problem +the residuals should be weighted so that each deviate has a +gaussian sigma of 1.0. If X represents values of the independent +variable, Y represents a measurement for each value of X, and ERR +represents the error in the measurements, then the deviates could +be calculated as follows: + + for (i=0; ifunct is a C function which computes the +residuals using any user data that is required. The number of +residuals is specified by the integer m. The nature and +properties of user function are described in greater detail below. + +The user function parameters are passed to mpfit as the array + double xall[npar]; +where npar is the number of parameters of the user function. +It must be the case that m > npar, i.e. there are more data points +than free parameters. The user must pass an initial "best guess" of +the user parameters to mpfit(), and upon successful return, the xall[] +array will contain the "best fit" parameters of the fit (and the original +values are overwritten). + +The user function is responsible to compute an array of generic +residuals. Usually these residuals will depend on user-dependent +quantities such as measured data or independent variables such "x" +when fitting a function of the form y(x). The user should pass these +quantities using the optional private_data pointer. If +private_data is not used then a null pointer should be passed. 
+Otherwise, it can be any C pointer, typically a pointer to a structure +such as this: + struct example_private_data { /* EXAMPLE: fitting y(x) */ + double *x; /* x - independent variable of model */ + double *y; /* y - measured "y" values */ + double *y_error; /* y_error - measurement uncertainty in y */ + }; +The user would be responsible to declare such a structure, and to fill +it with pointers to the relevant data arrays before calling mpfit(). +mpfit() itself does not inspect or modify the private_data contents, +but merely passes it along to the user function + +The structure pars is optional, and allows the user to specify +additional information and constraints about each user function +parameter. For example, the user can specify simple bounding +constraints. If passed, then pars[] must be dimensioned as, + mp_par pars[npar]; +where mp_par is a structure defined in mpfit.h. If no special +parameter information is necessary, then the user can pass +pars == 0. + +The optional structure config configures how mpfit() behaves, +and is described in greater detail below. By passing a null pointer, +the user obtains the default behavior. + +The structure result contains results of the fit, returned by +mpfit(). The user should pass a pointer to a structure of type +'mp_result' (which should be zeroed), and upon return, the structure +is filled with various diagnostic quantities about the fitting run. +These quantities are described in greater detail below. If these +diagnostics are not required, then the user may pass a null pointer. + + +USER FUNCTION + +The user must define a function which computes the appropriate values +as specified above. The function must compute the weighted deviations +between the model and the data. The user function may also optionally +compute explicit derivatives (see below). The user function should +be of the following form: + + int myfunct(int m, int n, double *p, double *deviates, + double **derivs, void *private) + { + int i; + /* Compute function deviates */ + for (i=0; i= 10 */ + + memset(&result, 0, sizeof(result)); + status = mpfit(myfunct, m, n, p, pars, 0, 0, &result); + + +EXAMPLE 6 - Increasing maximum number of iterations + +This example changes the maximum number of iterations from its default +to 1000. + + mp_config config; + mp_result result; + + memset(&config, 0, sizeof(config)); + config.maxiter = 1000; + memset(&result, 0, sizeof(result)); + status = mpfit(myfunct, m, n, p, 0, &config, 0, &result); + + +EXAMPLE 7 - Passing private data to user function + +This example passes "private" data to its user function using the +private parameter. It assumes that three variables, x, y, and ey, +already exist, and that the user function will know what to do with +them. + + mp_result result; + struct mystruct { + double *x; + double *y; + double *ey; + }; + + struct mystruct mydata; + + mydata.x = x; + mydata.y = y; + mydata.ey = ey; + + memset(&result, 0, sizeof(result)); + status = mpfit(myfunct, m, n, p, 0, 0, (void *)&mydata, &result); + + +EXAMPLE 8 - Complete example + + +This example shows how to fit sample data to a linear function +y = f(x) = a - b*x, where a and b are fitted parameters. There is sample +data included within the program. The parameters are initialized to +a = 1.0, b = 1.0, and then the fitting occurs. The function +residuals; within linfunc(), the value of f(x) is computed. The main +routine, main() initializes the variables and calls mpfit(). 
+ + +#include "mpfit.h" +#include +#include +#include + +/* This is the private data structure which contains the data points + and their uncertainties */ +struct vars_struct { + double *x; + double *y; + double *ey; +}; + +/* + * linear fit function + * + * m - number of data points + * n - number of parameters (2) + * p - array of fit parameters + * dy - array of residuals to be returned + * vars - private data (struct vars_struct *) + * + * RETURNS: error code (0 = success) + */ +int linfunc(int m, int n, double *p, double *dy, double **dvec, void *vars) +{ + int i; + struct vars_struct *v = (struct vars_struct *) vars; + double *x, *y, *ey, f; + + x = v->x; + y = v->y; + ey = v->ey; + + for (i=0; i +#include +#include +#include +#include "mpfit.h" + +/* Forward declarations of functions in this module */ +static int mp_fdjac2(mp_func funct, + int m, int n, int *ifree, int npar, double *x, double *fvec, + double *fjac, int ldfjac, double epsfcn, + double *wa, void *priv, int *nfev, + double *step, double *dstep, int *dside, + int *qulimited, double *ulimit, + int *ddebug, double *ddrtol, double *ddatol, + double *wa2, double **dvecptr); +static void mp_qrfac(int m, int n, double *a, int lda, + int pivot, int *ipvt, int lipvt, + double *rdiag, double *acnorm, double *wa); +static void mp_qrsolv(int n, double *r, int ldr, int *ipvt, double *diag, + double *qtb, double *x, double *sdiag, double *wa); +static void mp_lmpar(int n, double *r, int ldr, int *ipvt, int *ifree, double *diag, + double *qtb, double delta, double *par, double *x, + double *sdiag, double *wa1, double *wa2); +static double mp_enorm(int n, double *x); +static double mp_dmax1(double a, double b); +static double mp_dmin1(double a, double b); +static int mp_min0(int a, int b); +static int mp_covar(int n, double *r, int ldr, int *ipvt, double tol, double *wa); + +/* Macro to call user function */ +#define mp_call(funct, m, n, x, fvec, dvec, priv) (*(funct))(m,n,x,fvec,dvec,priv) + +/* Macro to safely allocate memory */ +#define mp_malloc(dest,type,size) \ + dest = (type *) malloc( sizeof(type)*size ); \ + if (dest == 0) { \ + info = MP_ERR_MEMORY; \ + goto CLEANUP; \ + } else { \ + int _k; \ + for (_k=0; _k<(size); _k++) dest[_k] = 0; \ + } + +/* +* ********** +* +* subroutine mpfit +* +* the purpose of mpfit is to minimize the sum of the squares of +* m nonlinear functions in n variables by a modification of +* the levenberg-marquardt algorithm. the user must provide a +* subroutine which calculates the functions. the jacobian is +* then calculated by a finite-difference approximation. +* +* mp_funct funct - function to be minimized +* int m - number of data points +* int npar - number of fit parameters +* double *xall - array of n initial parameter values +* upon return, contains adjusted parameter values +* mp_par *pars - array of npar structures specifying constraints; +* or 0 (null pointer) for unconstrained fitting +* [ see README and mpfit.h for definition & use of mp_par] +* mp_config *config - pointer to structure which specifies the +* configuration of mpfit(); or 0 (null pointer) +* if the default configuration is to be used. +* See README and mpfit.h for definition and use +* of config. +* void *private - any private user data which is to be passed directly +* to funct without modification by mpfit(). +* mp_result *result - pointer to structure, which upon return, contains +* the results of the fit. The user should zero this +* structure. 
If any of the array values are to be +* returned, the user should allocate storage for them +* and assign the corresponding pointer in *result. +* Upon return, *result will be updated, and +* any of the non-null arrays will be filled. +* +* +* FORTRAN DOCUMENTATION BELOW +* +* +* the subroutine statement is +* +* subroutine lmdif(fcn,m,n,x,fvec,ftol,xtol,gtol,maxfev,epsfcn, +* diag,mode,factor,nprint,info,nfev,fjac, +* ldfjac,ipvt,qtf,wa1,wa2,wa3,wa4) +* +* where +* +* fcn is the name of the user-supplied subroutine which +* calculates the functions. fcn must be declared +* in an external statement in the user calling +* program, and should be written as follows. +* +* subroutine fcn(m,n,x,fvec,iflag) +* integer m,n,iflag +* double precision x(n),fvec(m) +* ---------- +* calculate the functions at x and +* return this vector in fvec. +* ---------- +* return +* end +* +* the value of iflag should not be changed by fcn unless +* the user wants to terminate execution of lmdif. +* in this case set iflag to a negative integer. +* +* m is a positive integer input variable set to the number +* of functions. +* +* n is a positive integer input variable set to the number +* of variables. n must not exceed m. +* +* x is an array of length n. on input x must contain +* an initial estimate of the solution vector. on output x +* contains the final estimate of the solution vector. +* +* fvec is an output array of length m which contains +* the functions evaluated at the output x. +* +* ftol is a nonnegative input variable. termination +* occurs when both the actual and predicted relative +* reductions in the sum of squares are at most ftol. +* therefore, ftol measures the relative error desired +* in the sum of squares. +* +* xtol is a nonnegative input variable. termination +* occurs when the relative error between two consecutive +* iterates is at most xtol. therefore, xtol measures the +* relative error desired in the approximate solution. +* +* gtol is a nonnegative input variable. termination +* occurs when the cosine of the angle between fvec and +* any column of the jacobian is at most gtol in absolute +* value. therefore, gtol measures the orthogonality +* desired between the function vector and the columns +* of the jacobian. +* +* maxfev is a positive integer input variable. termination +* occurs when the number of calls to fcn is at least +* maxfev by the end of an iteration. +* +* epsfcn is an input variable used in determining a suitable +* step length for the forward-difference approximation. this +* approximation assumes that the relative errors in the +* functions are of the order of epsfcn. if epsfcn is less +* than the machine precision, it is assumed that the relative +* errors in the functions are of the order of the machine +* precision. +* +* diag is an array of length n. if mode = 1 (see +* below), diag is internally set. if mode = 2, diag +* must contain positive entries that serve as +* multiplicative scale factors for the variables. +* +* mode is an integer input variable. if mode = 1, the +* variables will be scaled internally. if mode = 2, +* the scaling is specified by the input diag. other +* values of mode are equivalent to mode = 1. +* +* factor is a positive input variable used in determining the +* initial step bound. this bound is set to the product of +* factor and the euclidean norm of diag*x if nonzero, or else +* to factor itself. in most cases factor should lie in the +* interval (.1,100.). 100. is a generally recommended value. 
+* +* nprint is an integer input variable that enables controlled +* printing of iterates if it is positive. in this case, +* fcn is called with iflag = 0 at the beginning of the first +* iteration and every nprint iterations thereafter and +* immediately prior to return, with x and fvec available +* for printing. if nprint is not positive, no special calls +* of fcn with iflag = 0 are made. +* +* info is an integer output variable. if the user has +* terminated execution, info is set to the (negative) +* value of iflag. see description of fcn. otherwise, +* info is set as follows. +* +* info = 0 improper input parameters. +* +* info = 1 both actual and predicted relative reductions +* in the sum of squares are at most ftol. +* +* info = 2 relative error between two consecutive iterates +* is at most xtol. +* +* info = 3 conditions for info = 1 and info = 2 both hold. +* +* info = 4 the cosine of the angle between fvec and any +* column of the jacobian is at most gtol in +* absolute value. +* +* info = 5 number of calls to fcn has reached or +* exceeded maxfev. +* +* info = 6 ftol is too small. no further reduction in +* the sum of squares is possible. +* +* info = 7 xtol is too small. no further improvement in +* the approximate solution x is possible. +* +* info = 8 gtol is too small. fvec is orthogonal to the +* columns of the jacobian to machine precision. +* +* nfev is an integer output variable set to the number of +* calls to fcn. +* +* fjac is an output m by n array. the upper n by n submatrix +* of fjac contains an upper triangular matrix r with +* diagonal elements of nonincreasing magnitude such that +* +* t t t +* p *(jac *jac)*p = r *r, +* +* where p is a permutation matrix and jac is the final +* calculated jacobian. column j of p is column ipvt(j) +* (see below) of the identity matrix. the lower trapezoidal +* part of fjac contains information generated during +* the computation of r. +* +* ldfjac is a positive integer input variable not less than m +* which specifies the leading dimension of the array fjac. +* +* ipvt is an integer output array of length n. ipvt +* defines a permutation matrix p such that jac*p = q*r, +* where jac is the final calculated jacobian, q is +* orthogonal (not stored), and r is upper triangular +* with diagonal elements of nonincreasing magnitude. +* column j of p is column ipvt(j) of the identity matrix. +* +* qtf is an output array of length n which contains +* the first n elements of the vector (q transpose)*fvec. +* +* wa1, wa2, and wa3 are work arrays of length n. +* +* wa4 is a work array of length m. +* +* subprograms called +* +* user-supplied ...... fcn +* +* minpack-supplied ... dpmpar,enorm,fdjac2,lmpar,qrfac +* +* fortran-supplied ... dabs,dmax1,dmin1,dsqrt,mod +* +* argonne national laboratory. minpack project. march 1980. +* burton s. garbow, kenneth e. hillstrom, jorge j. 
more +* +* ********** */ + + +int mpfit(mp_func funct, int m, int npar, + double *xall, mp_par *pars, mp_config *config, void *private_data, + mp_result *result) +{ + mp_config conf; + int i, j, info, iflag, nfree, npegged, iter; + int qanylim = 0; + + int ij,jj,l; + double actred,delta,dirder,fnorm,fnorm1,gnorm, orignorm; + double par,pnorm,prered,ratio; + double sum,temp,temp1,temp2,temp3,xnorm, alpha; + static double one = 1.0; + static double p1 = 0.1; + static double p5 = 0.5; + static double p25 = 0.25; + static double p75 = 0.75; + static double p0001 = 1.0e-4; + static double zero = 0.0; + int nfev = 0; + + double *step = 0, *dstep = 0, *llim = 0, *ulim = 0; + int *pfixed = 0, *mpside = 0, *ifree = 0, *qllim = 0, *qulim = 0; + int *ddebug = 0; + double *ddrtol = 0, *ddatol = 0; + + double *fvec = 0, *qtf = 0; + double *x = 0, *xnew = 0, *fjac = 0, *diag = 0; + double *wa1 = 0, *wa2 = 0, *wa3 = 0, *wa4 = 0; + double **dvecptr = 0; + int *ipvt = 0; + + int ldfjac; + + /* Default configuration */ + conf.ftol = 1e-10; + conf.xtol = 1e-10; + conf.gtol = 1e-10; + conf.stepfactor = 100.0; + conf.nprint = 1; + conf.epsfcn = MP_MACHEP0; + conf.maxiter = 200; + conf.douserscale = 0; + conf.maxfev = 0; + conf.covtol = 1e-14; + conf.nofinitecheck = 0; + + if (config) { + /* Transfer any user-specified configurations */ + if (config->ftol > 0) conf.ftol = config->ftol; + if (config->xtol > 0) conf.xtol = config->xtol; + if (config->gtol > 0) conf.gtol = config->gtol; + if (config->stepfactor > 0) conf.stepfactor = config->stepfactor; + if (config->nprint >= 0) conf.nprint = config->nprint; + if (config->epsfcn > 0) conf.epsfcn = config->epsfcn; + if (config->maxiter > 0) conf.maxiter = config->maxiter; + if (config->maxiter == MP_NO_ITER) conf.maxiter = 0; + if (config->douserscale != 0) conf.douserscale = config->douserscale; + if (config->covtol > 0) conf.covtol = config->covtol; + if (config->nofinitecheck > 0) conf.nofinitecheck = config->nofinitecheck; + conf.maxfev = config->maxfev; + } + + info = MP_ERR_INPUT; /* = 0 */ + iflag = 0; + nfree = 0; + npegged = 0; + + /* Basic error checking */ + if (funct == 0) { + return MP_ERR_FUNC; + } + + if ((m <= 0) || (xall == 0)) { + return MP_ERR_NPOINTS; + } + + if (npar <= 0) { + return MP_ERR_NFREE; + } + + fnorm = -1.0; + fnorm1 = -1.0; + xnorm = -1.0; + delta = 0.0; + + /* FIXED parameters? */ + mp_malloc(pfixed, int, npar); + if (pars) for (i=0; i pars[i].limits[1])) ) { + info = MP_ERR_INITBOUNDS; + goto CLEANUP; + } + if ( (pars[i].fixed == 0) && pars[i].limited[0] && pars[i].limited[1] && + (pars[i].limits[0] >= pars[i].limits[1])) { + info = MP_ERR_BOUNDS; + goto CLEANUP; + } + } + + mp_malloc(qulim, int, nfree); + mp_malloc(qllim, int, nfree); + mp_malloc(ulim, double, nfree); + mp_malloc(llim, double, nfree); + + for (i=0; i 0)) { + ij = j*ldfjac; + for (i=0; i= ulim[j])); + int dwa1 = fabs(wa1[j]) > MP_MACHEP0; + + if (lpegged && (wa1[j] < 0)) wa1[j] = 0; + if (upegged && (wa1[j] > 0)) wa1[j] = 0; + + if (dwa1 && qllim[j] && ((x[j] + wa1[j]) < llim[j])) { + alpha = mp_dmin1(alpha, (llim[j]-x[j])/wa1[j]); + } + if (dwa1 && qulim[j] && ((x[j] + wa1[j]) > ulim[j])) { + alpha = mp_dmin1(alpha, (ulim[j]-x[j])/wa1[j]); + } + } + + /* Scale the resulting vector, advance to the next position */ + for (j=0; j= 0) ? (+1) : (-1); + sgnl = (llim[j] >= 0) ? 
(+1) : (-1); + ulim1 = ulim[j]*(1-sgnu*MP_MACHEP0) - ((ulim[j] == 0)?(MP_MACHEP0):0); + llim1 = llim[j]*(1+sgnl*MP_MACHEP0) + ((llim[j] == 0)?(MP_MACHEP0):0); + + if (qulim[j] && (wa2[j] >= ulim1)) { + wa2[j] = ulim[j]; + } + if (qllim[j] && (wa2[j] <= llim1)) { + wa2[j] = llim[j]; + } + } + + } + + for (j=0; j= zero) { + temp = p5; + } else { + temp = p5*dirder/(dirder + p5*actred); + } + if (((p1*fnorm1) >= fnorm) + || (temp < p1) ) { + temp = p1; + } + delta = temp*mp_dmin1(delta,pnorm/p1); + par = par/temp; + } else { + if ((par == zero) || (ratio >= p75) ) { + delta = pnorm/p5; + par = p5*par; + } + } + + /* + * test for successful iteration. + */ + if (ratio >= p0001) { + + /* + * successful iteration. update x, fvec, and their norms. + */ + for (j=0; j 0) && (nfev >= conf.maxfev)) { + /* Too many function evaluations */ + info = MP_MAXITER; + } + if (iter >= conf.maxiter) { + /* Too many iterations */ + info = MP_MAXITER; + } + if ((fabs(actred) <= MP_MACHEP0) && (prered <= MP_MACHEP0) && (p5*ratio <= one) ) { + info = MP_FTOL; + } + if (delta <= MP_MACHEP0*xnorm) { + info = MP_XTOL; + } + if (gnorm <= MP_MACHEP0) { + info = MP_GTOL; + } + if (info != 0) { + goto L300; + } + + /* + * end of the inner loop. repeat if iteration unsuccessful. + */ + if (ratio < p0001) goto L200; + /* + * end of the outer loop. + */ + goto OUTER_LOOP; + + L300: + /* + * termination, either normal or user imposed. + */ + if (iflag < 0) { + info = iflag; + } + iflag = 0; + + for (i=0; i 0) && (info > 0)) { + iflag = mp_call(funct, m, npar, xall, fvec, 0, private_data); + nfev += 1; + } + + /* Compute number of pegged parameters */ + npegged = 0; + if (pars) for (i=0; icovar || result->xerror)) { + mp_covar(nfree, fjac, ldfjac, ipvt, conf.covtol, wa2); + + if (result->covar) { + /* Zero the destination covariance array */ + for (j=0; j<(npar*npar); j++) result->covar[j] = 0; + + /* Transfer the covariance array */ + for (j=0; jcovar[ifree[j]*npar+ifree[i]] = fjac[j*ldfjac+i]; + } + } + } + + if (result->xerror) { + for (j=0; jxerror[j] = 0; + + for (j=0; j 0) result->xerror[ifree[j]] = sqrt(cc); + } + } + } + + if (result) { + strcpy(result->version, MPFIT_VERSION); + result->bestnorm = mp_dmax1(fnorm,fnorm1); + result->bestnorm *= result->bestnorm; + result->orignorm = orignorm; + result->status = info; + result->niter = iter; + result->nfev = nfev; + result->npar = npar; + result->nfree = nfree; + result->npegged = npegged; + result->nfunc = m; + + /* Copy residuals if requested */ + if (result->resid) { + for (j=0; jresid[j] = fvec[j]; + } + } + + + CLEANUP: + if (fvec) free(fvec); + if (qtf) free(qtf); + if (x) free(x); + if (xnew) free(xnew); + if (fjac) free(fjac); + if (diag) free(diag); + if (wa1) free(wa1); + if (wa2) free(wa2); + if (wa3) free(wa3); + if (wa4) free(wa4); + if (ipvt) free(ipvt); + if (pfixed) free(pfixed); + if (step) free(step); + if (dstep) free(dstep); + if (mpside) free(mpside); + if (ddebug) free(ddebug); + if (ddrtol) free(ddrtol); + if (ddatol) free(ddatol); + if (ifree) free(ifree); + if (qllim) free(qllim); + if (qulim) free(qulim); + if (llim) free(llim); + if (ulim) free(ulim); + if (dvecptr) free(dvecptr); + + return info; +} + + +/************************fdjac2.c*************************/ + +static +int mp_fdjac2(mp_func funct, + int m, int n, int *ifree, int npar, double *x, double *fvec, + double *fjac, int ldfjac, double epsfcn, + double *wa, void *priv, int *nfev, + double *step, double *dstep, int *dside, + int *qulimited, double *ulimit, + int *ddebug, double 
*ddrtol, double *ddatol, + double *wa2, double **dvec) +{ +/* +* ********** +* +* subroutine fdjac2 +* +* this subroutine computes a forward-difference approximation +* to the m by n jacobian matrix associated with a specified +* problem of m functions in n variables. +* +* the subroutine statement is +* +* subroutine fdjac2(fcn,m,n,x,fvec,fjac,ldfjac,iflag,epsfcn,wa) +* +* where +* +* fcn is the name of the user-supplied subroutine which +* calculates the functions. fcn must be declared +* in an external statement in the user calling +* program, and should be written as follows. +* +* subroutine fcn(m,n,x,fvec,iflag) +* integer m,n,iflag +* double precision x(n),fvec(m) +* ---------- +* calculate the functions at x and +* return this vector in fvec. +* ---------- +* return +* end +* +* the value of iflag should not be changed by fcn unless +* the user wants to terminate execution of fdjac2. +* in this case set iflag to a negative integer. +* +* m is a positive integer input variable set to the number +* of functions. +* +* n is a positive integer input variable set to the number +* of variables. n must not exceed m. +* +* x is an input array of length n. +* +* fvec is an input array of length m which must contain the +* functions evaluated at x. +* +* fjac is an output m by n array which contains the +* approximation to the jacobian matrix evaluated at x. +* +* ldfjac is a positive integer input variable not less than m +* which specifies the leading dimension of the array fjac. +* +* iflag is an integer variable which can be used to terminate +* the execution of fdjac2. see description of fcn. +* +* epsfcn is an input variable used in determining a suitable +* step length for the forward-difference approximation. this +* approximation assumes that the relative errors in the +* functions are of the order of epsfcn. if epsfcn is less +* than the machine precision, it is assumed that the relative +* errors in the functions are of the order of the machine +* precision. +* +* wa is a work array of length m. +* +* subprograms called +* +* user-supplied ...... fcn +* +* minpack-supplied ... dpmpar +* +* fortran-supplied ... dabs,dmax1,dsqrt +* +* argonne national laboratory. minpack project. march 1980. +* burton s. garbow, kenneth e. hillstrom, jorge j. more +* + ********** +*/ + int i,j,ij; + int iflag = 0; + double eps,h,temp; + static double zero = 0.0; + int has_analytical_deriv = 0, has_numerical_deriv = 0; + int has_debug_deriv = 0; + + temp = mp_dmax1(epsfcn,MP_MACHEP0); + eps = sqrt(temp); + ij = 0; + ldfjac = 0; /* Prevent compiler warning */ + if (ldfjac){} /* Prevent compiler warning */ + + for (j=0; j 0) h = step[ifree[j]]; + if (dstep && dstep[ifree[j]] > 0) h = fabs(dstep[ifree[j]]*temp); + if (h == zero) h = eps; + + /* If negative step requested, or we are against the upper limit */ + if ((dside && dsidei == -1) || + (dside && dsidei == 0 && + qulimited && ulimit && qulimited[j] && + (temp > (ulimit[j]-h)))) { + h = -h; + } + + x[ifree[j]] = temp + h; + iflag = mp_call(funct, m, npar, x, wa, 0, priv); + if (nfev) *nfev = *nfev + 1; + if (iflag < 0 ) goto DONE; + x[ifree[j]] = temp; + + if (dsidei <= 1) { + /* COMPUTE THE ONE-SIDED DERIVATIVE */ + if (! 
debug) { + /* Non-debug path for speed */ + for (i=0; i da + fabs(fjold)*dr))) { + printf(" %10d %10.4g %10.4g %10.4g %10.4g %10.4g\n", + i, fvec[i], fjold, fjac[ij], fjold-fjac[ij], + (fjold == 0)?(0):((fjold-fjac[ij])/fjold)); + } + } + } /* end debugging */ + + } else { /* dside > 2 */ + /* COMPUTE THE TWO-SIDED DERIVATIVE */ + for (i=0; i da + fabs(fjold)*dr))) { + printf(" %10d %10.4g %10.4g %10.4g %10.4g %10.4g\n", + i, fvec[i], fjold, fjac[ij], fjold-fjac[ij], + (fjold == 0)?(0):((fjold-fjac[ij])/fjold)); + } + } + } /* end debugging */ + + } /* if (dside > 2) */ + } /* if (has_numerical_derivative) */ + + if (has_debug_deriv) { + printf("FJAC DEBUG END\n"); + } + + DONE: + if (iflag < 0) return iflag; + return 0; + /* + * last card of subroutine fdjac2. + */ +} + + +/************************qrfac.c*************************/ + +static +void mp_qrfac(int m, int n, double *a, int lda, + int pivot, int *ipvt, int lipvt, + double *rdiag, double *acnorm, double *wa) +{ +/* +* ********** +* +* subroutine qrfac +* +* this subroutine uses householder transformations with column +* pivoting (optional) to compute a qr factorization of the +* m by n matrix a. that is, qrfac determines an orthogonal +* matrix q, a permutation matrix p, and an upper trapezoidal +* matrix r with diagonal elements of nonincreasing magnitude, +* such that a*p = q*r. the householder transformation for +* column k, k = 1,2,...,min(m,n), is of the form +* +* t +* i - (1/u(k))*u*u +* +* where u has zeros in the first k-1 positions. the form of +* this transformation and the method of pivoting first +* appeared in the corresponding linpack subroutine. +* +* the subroutine statement is +* +* subroutine qrfac(m,n,a,lda,pivot,ipvt,lipvt,rdiag,acnorm,wa) +* +* where +* +* m is a positive integer input variable set to the number +* of rows of a. +* +* n is a positive integer input variable set to the number +* of columns of a. +* +* a is an m by n array. on input a contains the matrix for +* which the qr factorization is to be computed. on output +* the strict upper trapezoidal part of a contains the strict +* upper trapezoidal part of r, and the lower trapezoidal +* part of a contains a factored form of q (the non-trivial +* elements of the u vectors described above). +* +* lda is a positive integer input variable not less than m +* which specifies the leading dimension of the array a. +* +* pivot is a logical input variable. if pivot is set true, +* then column pivoting is enforced. if pivot is set false, +* then no column pivoting is done. +* +* ipvt is an integer output array of length lipvt. ipvt +* defines the permutation matrix p such that a*p = q*r. +* column j of p is column ipvt(j) of the identity matrix. +* if pivot is false, ipvt is not referenced. +* +* lipvt is a positive integer input variable. if pivot is false, +* then lipvt may be as small as 1. if pivot is true, then +* lipvt must be at least n. +* +* rdiag is an output array of length n which contains the +* diagonal elements of r. +* +* acnorm is an output array of length n which contains the +* norms of the corresponding columns of the input matrix a. +* if this information is not needed, then acnorm can coincide +* with rdiag. +* +* wa is a work array of length n. if pivot is false, then wa +* can coincide with rdiag. +* +* subprograms called +* +* minpack-supplied ... dpmpar,enorm +* +* fortran-supplied ... dmax1,dsqrt,min0 +* +* argonne national laboratory. minpack project. march 1980. +* burton s. garbow, kenneth e. hillstrom, jorge j. 
more +* +* ********** +*/ + int i,ij,jj,j,jp1,k,kmax,minmn; + double ajnorm,sum,temp; + static double zero = 0.0; + static double one = 1.0; + static double p05 = 0.05; + + lda = 0; /* Prevent compiler warning */ + lipvt = 0; /* Prevent compiler warning */ + if (lda) {} /* Prevent compiler warning */ + if (lipvt) {} /* Prevent compiler warning */ + + /* + * compute the initial column norms and initialize several arrays. + */ + ij = 0; + for (j=0; j rdiag[kmax]) + kmax = k; + } + if (kmax == j) + goto L40; + + ij = m * j; + jj = m * kmax; + for (i=0; i kp1) + { + ik = kk + 1; + for (i=kp1; i jp1) + { + ij = jp1 + ldr * j; + for (i=jp1; i= 1) { + for (k=0; k= 0) + { + ij = ldr * j; + for (i=0; i<=jm1; i++) + { + wa1[i] -= r[ij]*temp; + ij += 1; + } + } + } + } + + for (j=0; j= n) { + for (j=0; j= 0) + { + ij = jj; + for (i=0; i<=jm1; i++) + { + sum += r[ij]*wa1[i]; + ij += 1; + } + } + wa1[j] = (wa1[j] - sum)/r[j+ldr*j]; + jj += ldr; /* [i+ldr*j] */ + } + temp = mp_enorm(n,wa1); + parl = ((fp/delta)/temp)/temp; + } + /* + * calculate an upper bound, paru, for the zero of the function. + */ + jj = 0; + for (j=0; j zero) + parl = mp_dmax1(parl, *par); + if (fp < zero) + paru = mp_dmin1(paru, *par); + /* + * compute an improved estimate for par. + */ + *par = mp_dmax1(parl, *par + parc); + /* + * end of an iteration. + */ + goto L150; + + L220: + /* + * termination. + */ + if (iter == 0) + *par = zero; + /* + * last card of subroutine lmpar. + */ +} + + +/************************enorm.c*************************/ + +static +double mp_enorm(int n, double *x) +{ + /* + * ********** + * + * function enorm + * + * given an n-vector x, this function calculates the + * euclidean norm of x. + * + * the euclidean norm is computed by accumulating the sum of + * squares in three different sums. the sums of squares for the + * small and large components are scaled so that no overflows + * occur. non-destructive underflows are permitted. underflows + * and overflows do not occur in the computation of the unscaled + * sum of squares for the intermediate components. + * the definitions of small, intermediate and large components + * depend on two constants, rdwarf and rgiant. the main + * restrictions on these constants are that rdwarf**2 not + * underflow and rgiant**2 not overflow. the constants + * given here are suitable for every known computer. + * + * the function statement is + * + * double precision function enorm(n,x) + * + * where + * + * n is a positive integer input variable. + * + * x is an input array of length n. + * + * subprograms called + * + * fortran-supplied ... dabs,dsqrt + * + * argonne national laboratory. minpack project. march 1980. + * burton s. garbow, kenneth e. hillstrom, jorge j. more + * + * ********** + */ + int i; + double agiant,floatn,s1,s2,s3,xabs,x1max,x3max; + double ans, temp; + double rdwarf = MP_RDWARF; + double rgiant = MP_RGIANT; + static double zero = 0.0; + static double one = 1.0; + + s1 = zero; + s2 = zero; + s3 = zero; + x1max = zero; + x3max = zero; + floatn = n; + agiant = rgiant/floatn; + + for (i=0; i rdwarf) && (xabs < agiant)) + { + /* + * sum for intermediate components. + */ + s2 += xabs*xabs; + continue; + } + + if (xabs > rdwarf) + { + /* + * sum for large components. + */ + if (xabs > x1max) + { + temp = x1max/xabs; + s1 = one + s1*temp*temp; + x1max = xabs; + } + else + { + temp = xabs/x1max; + s1 += temp*temp; + } + continue; + } + /* + * sum for small components. 
+ */ + if (xabs > x3max) + { + temp = x3max/xabs; + s3 = one + s3*temp*temp; + x3max = xabs; + } + else + { + if (xabs != zero) + { + temp = xabs/x3max; + s3 += temp*temp; + } + } + } + /* + * calculation of norm. + */ + if (s1 != zero) { + temp = s1 + (s2/x1max)/x1max; + ans = x1max*sqrt(temp); + return(ans); + } + if (s2 != zero) { + if (s2 >= x3max) + temp = s2*(one+(x3max/s2)*(x3max*s3)); + else + temp = x3max*((s2/x3max)+(x3max*s3)); + ans = sqrt(temp); + } + else + { + ans = x3max*sqrt(s3); + } + return(ans); + /* + * last card of function enorm. + */ +} + +/************************lmmisc.c*************************/ + +static +double mp_dmax1(double a, double b) +{ + if (a >= b) + return(a); + else + return(b); +} + +static +double mp_dmin1(double a, double b) +{ + if (a <= b) + return(a); + else + return(b); +} + +static +int mp_min0(int a, int b) +{ + if (a <= b) + return(a); + else + return(b); +} + +/************************covar.c*************************/ +/* +c ********** +c +c subroutine covar +c +c given an m by n matrix a, the problem is to determine +c the covariance matrix corresponding to a, defined as +c +c t +c inverse(a *a) . +c +c this subroutine completes the solution of the problem +c if it is provided with the necessary information from the +c qr factorization, with column pivoting, of a. that is, if +c a*p = q*r, where p is a permutation matrix, q has orthogonal +c columns, and r is an upper triangular matrix with diagonal +c elements of nonincreasing magnitude, then covar expects +c the full upper triangle of r and the permutation matrix p. +c the covariance matrix is then computed as +c +c t t +c p*inverse(r *r)*p . +c +c if a is nearly rank deficient, it may be desirable to compute +c the covariance matrix corresponding to the linearly independent +c columns of a. to define the numerical rank of a, covar uses +c the tolerance tol. if l is the largest integer such that +c +c abs(r(l,l)) .gt. tol*abs(r(1,1)) , +c +c then covar computes the covariance matrix corresponding to +c the first l columns of r. for k greater than l, column +c and row ipvt(k) of the covariance matrix are set to zero. +c +c the subroutine statement is +c +c subroutine covar(n,r,ldr,ipvt,tol,wa) +c +c where +c +c n is a positive integer input variable set to the order of r. +c +c r is an n by n array. on input the full upper triangle must +c contain the full upper triangle of the matrix r. on output +c r contains the square symmetric covariance matrix. +c +c ldr is a positive integer input variable not less than n +c which specifies the leading dimension of the array r. +c +c ipvt is an integer input array of length n which defines the +c permutation matrix p such that a*p = q*r. column j of p +c is column ipvt(j) of the identity matrix. +c +c tol is a nonnegative input variable used to define the +c numerical rank of a in the manner described above. +c +c wa is a work array of length n. +c +c subprograms called +c +c fortran-supplied ... dabs +c +c argonne national laboratory. minpack project. august 1980. +c burton s. garbow, kenneth e. hillstrom, jorge j. more +c +c ********** +*/ + +static +int mp_covar(int n, double *r, int ldr, int *ipvt, double tol, double *wa) +{ + int i, ii, j, jj, k, l; + int kk, kj, ji, j0, k0, jj0; + int sing; + double one = 1.0, temp, tolr, zero = 0.0; + + /* + * form the inverse of r in the full upper triangle of r. 
+ */ + +#if 0 + for (j=0; j= 0) { + for (k=0; k <= l; k++) { + k0 = k*ldr; + + for (j=0; j l); + j0 = j*ldr; + jj0 = jj*ldr; + for (i=0; i<=j; i++) { + ji = j0+i; + + if (sing) r[ji] = zero; + ii = ipvt[i]; + if (ii > jj) r[jj0+ii] = r[ji]; + if (ii < jj) r[ii*ldr+jj] = r[ji]; + } + wa[jj] = r[j0+j]; + } + + /* + * Symmetrize the covariance matrix in r + */ + for (j=0; j= 199901L +#define mpfinite(x) isfinite(x) + +/* Microsoft C uses _finite(x) instead of finite(x) */ +#elif defined(_MSC_VER) && _MSC_VER +#include +#define mpfinite(x) _finite(x) + +/* Default is to assume that compiler/library has finite() function */ +#else +#define mpfinite(x) finite(x) + +#endif + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* MPFIT_H */ diff --git a/be/src/util/CMakeLists.txt b/be/src/util/CMakeLists.txt index 08002ed8ad..c9e0f3c012 100644 --- a/be/src/util/CMakeLists.txt +++ b/be/src/util/CMakeLists.txt @@ -17,6 +17,7 @@ set(SQUEASEL_SRC_DIR "${CMAKE_SOURCE_DIR}/be/src/thirdparty/squeasel") set(MUSTACHE_SRC_DIR "${CMAKE_SOURCE_DIR}/be/src/thirdparty/mustache") +set(MPFIT_SRC_DIR "${CMAKE_SOURCE_DIR}/be/src/thirdparty/mpfit") # where to put generated libraries set(LIBRARY_OUTPUT_PATH "${BUILD_OUTPUT_ROOT_DIRECTORY}/util") @@ -58,6 +59,7 @@ add_library(Util min-max-filter.cc min-max-filter-ir.cc minidump.cc + mpfit-util.cc network-util.cc openssl-util.cc os-info.cc @@ -86,6 +88,7 @@ add_library(Util ${SQUEASEL_SRC_DIR}/squeasel.c webserver.cc ${MUSTACHE_SRC_DIR}/mustache.cc + ${MPFIT_SRC_DIR}/mpfit.c ) add_dependencies(Util gen-deps gen_ir_descriptions) diff --git a/be/src/util/mpfit-util.cc b/be/src/util/mpfit-util.cc new file mode 100644 index 0000000000..0f001adaa7 --- /dev/null +++ b/be/src/util/mpfit-util.cc @@ -0,0 +1,82 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "util/mpfit-util.h" + +#include + +using namespace impala; +using std::string; +using std::function; + +ObjectiveFunction::ObjectiveFunction(string name, int num_params, + function fn) + : name_(name), num_params_(num_params), fn_(fn), num_points_(0), xs_(nullptr), + ys_(nullptr) { + DCHECK_GT(num_params, 0); +} + +/// User-supplied function called by MpFit during curve fitting. MpFit requires this +/// user-supplied function to compute "dy = y - fn(x)" for every known data point given +/// the objective function 'fn' and the current list of parameter values 'params'. +/// Declared here so it can be used in ObjectiveFunction below. +/// 'num_points' is the number of data points for fitting +/// 'num_params' is the number of parameters in the objective function +/// 'params' is an input array with 'num_params' parameter values +/// 'dy' is an output parameter allocated by MpFit. It contains the "delta y" for each +/// data point. 
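+/// For example (illustrative numbers only): with an objective function fn(x) = a*x + b,
+/// current parameter values {a = 3, b = 1} and the data point (x = 2, y = 10), the
+/// entry for that point is dy = 10 - (3*2 + 1) = 3.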
This function sets dy[i] = y[i] - fn(x[i]) for each data point i and +/// for the objective function 'fn'. +/// 'obj_func' points to an ObjectiveFunction which contains the objective function +/// and the data points +/// The 'dvec' parameter is not important for our purposes. See the MpFit documentation. +/// MpFit allows these user-supplied functions to indicate errors by returning a non-zero +/// value. Our objective functions are simple and we always return 0. +static int ComputeDeltaY(int num_points, int num_params, double* params, double* dy, + double** dvec, void* obj_func); + +bool ObjectiveFunction::LmsFit(const double* xs, const double* ys, int num_points) { + DCHECK(xs != nullptr); + DCHECK(ys != nullptr); + DCHECK_GE(num_points, 0); + num_points_ = num_points; + xs_ = xs; + ys_ = ys; + + // Initialize function parameters. + params_.reset(new (std::nothrow) double[num_params_]); + if (params_ == nullptr) return false; + for (int i = 0; i < num_params_; ++i) params_.get()[i] = 1.0; + + struct mp_config_struct config; + memset(&config, 0, sizeof(config)); + // Maximum number of calls to SampledNdvMpFit(). Value determined empirically. + config.maxfev = 1000; + // Maximum number of fitting iterations. Value determined empirically. + config.maxiter = 200; + memset(&result_, 0, sizeof(result_)); + int ret = mpfit(ComputeDeltaY, num_points, num_params_, params_.get(), nullptr, + &config, this, &result_); + // Any positive integer indicates success. + return ret > 0; +} + +int ComputeDeltaY(int num_points, int num_params, double* params, double* dy, + double** dvec, void* obj_func) { + ObjectiveFunction* fn = reinterpret_cast(obj_func); + for (int i = 0; i < num_points; ++i) dy[i] = fn->GetDeltaY(i, params); + return 0; +} diff --git a/be/src/util/mpfit-util.h b/be/src/util/mpfit-util.h new file mode 100644 index 0000000000..9eac3c2907 --- /dev/null +++ b/be/src/util/mpfit-util.h @@ -0,0 +1,98 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + + +#ifndef IMPALA_MPFIT_UTIL_H +#define IMPALA_MPFIT_UTIL_H + +#include +#include +#include + +#include "common/logging.h" +#include "common/status.h" +#include "thirdparty/mpfit/mpfit.h" + +namespace impala { + +/// Objective function to be fit using the MpFit library. An objective function has +/// exactly one 'x' variable and any number of additional non-variable parameters whose +/// values are unknown. The purpose of curve fitting is to find the best values for those +/// parameters given an objective function and a series of x/y data points. +/// This class contains the objective function as well as the data points because that's +/// the most natural setup for the MpFit API. +/// Calling LmsFit() determines the parameters for the objective function. 
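+/// For illustration only (a hypothetical model, not one shipped in this change), a
+/// power-law curve y = a * x^b with two unknown parameters could be set up and fit
+/// roughly as follows, where 'xs', 'ys' and 'num_points' are caller-provided data:
+///   ObjectiveFunction power("POWER", 2,
+///       [](double x, const double* p) { return p[0] * pow(x, p[1]); });
+///   if (power.LmsFit(xs, ys, num_points)) {
+///     double y_pred = power.GetY(1000);  // Evaluate the fitted curve at x = 1000.
+///   }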
+/// Calling GetY() computes the 'y' value for a given 'x' using the function parameters. +/// The objective function is of type function +/// By convention, the first argument is the value of the 'x' variable and the second +/// argument is an array of function parameters which are determined during fitting. +class ObjectiveFunction { + public: + ObjectiveFunction(std::string name, int num_params, + std::function fn); + + /// Performs least mean squares (LMS) curve fitting using the MpFit library + /// against the provided x/y data points. + /// Returns true if fitting was successful, false otherwise. + bool LmsFit(const double* xs, const double* ys, int num_points) WARN_UNUSED_RESULT; + + /// Evaluates the objective function over the given 'x' value. + double GetY(int64_t x) const { + DCHECK(params_ != nullptr); + return fn_(x, params_.get()); + } + + /// Returns the difference between the y value of data point 'pidx' and the + /// y value of the objective function with the given parameters over the x value + /// of the same point. + double GetDeltaY(int pidx, const double* params) const { + DCHECK_LT(pidx, num_points_); + return ys_[pidx] - fn_(xs_[pidx], params); + } + + /// Returns the Chi-Square of fitting. This is an indication of how well the function + /// fits. Lower is better. Valid to call after LmsFit(). + double GetError() const { + DCHECK(params_ != nullptr); + return result_.bestnorm; + } + + private: + /// Human-readable name of this function. Used for debugging. + std::string name_; + + /// Function parameters to be determined by fitting. + const int num_params_; + std::unique_ptr params_; + + /// MPFit result structure. Populated by in LmsFit(). All pointers in this structure + /// are optional and must be allocated and owned by the caller of mpfit(). Passing + /// nullptr indicates to MPFit that those fields should not be populated. + mp_result result_; + + /// Objective function whose parameters should be fit to the data points. + std::function fn_; + + /// Known x/y data points. Memory not owned. + int num_points_; + const double* xs_; + const double* ys_; +}; + +} + +#endif diff --git a/bin/rat_exclude_files.txt b/bin/rat_exclude_files.txt index 73cd0739f3..fdb19266f9 100644 --- a/bin/rat_exclude_files.txt +++ b/bin/rat_exclude_files.txt @@ -19,8 +19,9 @@ testdata/__init__.py tests/__init__.py www/index.html -# See LICENSE.txt +# See $IMPALA_HOME/LICENSE.txt be/src/gutil/* +be/src/thirdparty/mpfit/* be/src/kudu/gutil www/highlight/* www/DataTables*/* diff --git a/bin/run_clang_tidy.sh b/bin/run_clang_tidy.sh index ae389a9bae..e879b353d8 100755 --- a/bin/run_clang_tidy.sh +++ b/bin/run_clang_tidy.sh @@ -35,7 +35,10 @@ then echo "WARNING: compile failed" >&2 fi -DIRS=$(ls -d "${IMPALA_HOME}/be/src/"*/ | grep -v gutil | grep -v kudu | tr '\n' ' ') +DIRS=$(ls -d "${IMPALA_HOME}/be/src/"*/ | grep -v gutil | grep -v kudu |\ + grep -v thirdparty | tr '\n' ' ') +# Include/exclude select thirdparty dirs. +DIRS=$DIRS$(ls -d "${IMPALA_HOME}/be/src/thirdparty/"*/ | grep -v mpfit | tr '\n' ' ') PIPE_DIRS=$(echo "${DIRS}" | tr ' ' '|') # Reduce the concurrency to one less than the number of cores in the system. 
Note than diff --git a/fe/src/main/java/org/apache/impala/analysis/FunctionCallExpr.java b/fe/src/main/java/org/apache/impala/analysis/FunctionCallExpr.java index 9574a02ff3..12a34f72d2 100644 --- a/fe/src/main/java/org/apache/impala/analysis/FunctionCallExpr.java +++ b/fe/src/main/java/org/apache/impala/analysis/FunctionCallExpr.java @@ -469,8 +469,6 @@ protected void analyzeImpl(Analyzer analyzer) throws AnalysisException { return; } - Type[] argTypes = collectChildReturnTypes(); - // User needs DB access. Db db = analyzer.getDb(fnName_.getDb(), Privilege.VIEW_METADATA, true); if (!db.containsFunction(fnName_.getFunction())) { @@ -482,8 +480,7 @@ protected void analyzeImpl(Analyzer analyzer) throws AnalysisException { // There is no version of COUNT() that takes more than 1 argument but after // the rewrite, we only need count(*). // TODO: fix how we rewrite count distinct. - argTypes = new Type[0]; - Function searchDesc = new Function(fnName_, argTypes, Type.INVALID, false); + Function searchDesc = new Function(fnName_, new Type[0], Type.INVALID, false); fn_ = db.getFunction(searchDesc, Function.CompareMode.IS_NONSTRICT_SUPERTYPE_OF); type_ = fn_.getReturnType(); // Make sure BE doesn't see any TYPE_NULL exprs @@ -506,6 +503,28 @@ protected void analyzeImpl(Analyzer analyzer) throws AnalysisException { "AVG requires a numeric or timestamp parameter: " + toSql()); } + // SAMPLED_NDV() is only valid with two children. Invocations with an invalid number + // of children are gracefully handled when resolving the function signature. + if (fnName_.getFunction().equalsIgnoreCase("sampled_ndv") + && children_.size() == 2) { + if (!(children_.get(1) instanceof NumericLiteral)) { + throw new AnalysisException( + "Second parameter of SAMPLED_NDV() must be a numeric literal in [0,1]: " + + children_.get(1).toSql()); + } + NumericLiteral samplePerc = (NumericLiteral) children_.get(1); + if (samplePerc.getDoubleValue() < 0 || samplePerc.getDoubleValue() > 1.0) { + throw new AnalysisException( + "Second parameter of SAMPLED_NDV() must be a numeric literal in [0,1]: " + + samplePerc.toSql()); + } + // Numeric literals with a decimal point are analyzed as decimals. Without this + // cast we might resolve to the wrong function because there is no exactly + // matching signature with decimal as the second argument. 
+ children_.set(1, samplePerc.uncheckedCastTo(Type.DOUBLE)); + } + + Type[] argTypes = collectChildReturnTypes(); Function searchDesc = new Function(fnName_, argTypes, Type.INVALID, false); fn_ = db.getFunction(searchDesc, Function.CompareMode.IS_NONSTRICT_SUPERTYPE_OF); if (fn_ == null || (!isInternalFnCall_ && !fn_.userVisible())) { diff --git a/fe/src/main/java/org/apache/impala/catalog/BuiltinsDb.java b/fe/src/main/java/org/apache/impala/catalog/BuiltinsDb.java index 07699d3e78..f3164100fd 100644 --- a/fe/src/main/java/org/apache/impala/catalog/BuiltinsDb.java +++ b/fe/src/main/java/org/apache/impala/catalog/BuiltinsDb.java @@ -22,7 +22,6 @@ import java.util.Map; import org.apache.hadoop.hive.metastore.api.Database; - import org.apache.impala.analysis.ArithmeticExpr; import org.apache.impala.analysis.BinaryPredicate; import org.apache.impala.analysis.CaseExpr; @@ -32,6 +31,7 @@ import org.apache.impala.analysis.IsNullPredicate; import org.apache.impala.analysis.LikePredicate; import org.apache.impala.builtins.ScalarBuiltins; + import com.google.common.collect.ImmutableMap; import com.google.common.collect.Lists; @@ -304,6 +304,30 @@ private static Database createMetastoreDb(String name) { "9HllUpdateIN10impala_udf10DecimalValEEEvPNS2_15FunctionContextERKT_PNS2_9StringValE") .build(); + private static final Map SAMPLED_NDV_UPDATE_SYMBOL = + ImmutableMap.builder() + .put(Type.BOOLEAN, + "16SampledNdvUpdateIN10impala_udf10BooleanValEEEvPNS2_15FunctionContextERKT_RKNS2_9DoubleValEPNS2_9StringValE") + .put(Type.TINYINT, + "16SampledNdvUpdateIN10impala_udf10TinyIntValEEEvPNS2_15FunctionContextERKT_RKNS2_9DoubleValEPNS2_9StringValE") + .put(Type.SMALLINT, + "16SampledNdvUpdateIN10impala_udf11SmallIntValEEEvPNS2_15FunctionContextERKT_RKNS2_9DoubleValEPNS2_9StringValE") + .put(Type.INT, + "16SampledNdvUpdateIN10impala_udf6IntValEEEvPNS2_15FunctionContextERKT_RKNS2_9DoubleValEPNS2_9StringValE") + .put(Type.BIGINT, + "16SampledNdvUpdateIN10impala_udf9BigIntValEEEvPNS2_15FunctionContextERKT_RKNS2_9DoubleValEPNS2_9StringValE") + .put(Type.FLOAT, + "16SampledNdvUpdateIN10impala_udf8FloatValEEEvPNS2_15FunctionContextERKT_RKNS2_9DoubleValEPNS2_9StringValE") + .put(Type.DOUBLE, + "16SampledNdvUpdateIN10impala_udf9DoubleValEEEvPNS2_15FunctionContextERKT_RKS3_PNS2_9StringValE") + .put(Type.STRING, + "16SampledNdvUpdateIN10impala_udf9StringValEEEvPNS2_15FunctionContextERKT_RKNS2_9DoubleValEPS3_") + .put(Type.TIMESTAMP, + "16SampledNdvUpdateIN10impala_udf12TimestampValEEEvPNS2_15FunctionContextERKT_RKNS2_9DoubleValEPNS2_9StringValE") + .put(Type.DECIMAL, + "16SampledNdvUpdateIN10impala_udf10DecimalValEEEvPNS2_15FunctionContextERKT_RKNS2_9DoubleValEPNS2_9StringValE") + .build(); + private static final Map PC_UPDATE_SYMBOL = ImmutableMap.builder() .put(Type.BOOLEAN, @@ -788,6 +812,19 @@ private void initAggregateBuiltins() { "_Z20IncrementNdvFinalizePN10impala_udf15FunctionContextERKNS_9StringValE", true, false, true)); + // SAMPLED_NDV. + // Size needs to be kept in sync with SampledNdvState in the BE. 
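+    // For illustration: assuming HLL_INTERMEDIATE_SIZE is 1024 bytes, the expression
+    // below works out to 16 + 32 * (8 + 1024) = 33040 bytes per intermediate value.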
+ int NUM_HLL_BUCKETS = 32; + int size = 16 + NUM_HLL_BUCKETS * (8 + HLL_INTERMEDIATE_SIZE); + Type sampledIntermediateType = ScalarType.createFixedUdaIntermediateType(size); + db.addBuiltin(AggregateFunction.createBuiltin(db, "sampled_ndv", + Lists.newArrayList(t, Type.DOUBLE), Type.BIGINT, sampledIntermediateType, + prefix + "14SampledNdvInitEPN10impala_udf15FunctionContextEPNS1_9StringValE", + prefix + SAMPLED_NDV_UPDATE_SYMBOL.get(t), + prefix + "15SampledNdvMergeEPN10impala_udf15FunctionContextERKNS1_9StringValEPS4_", + null, + prefix + "18SampledNdvFinalizeEPN10impala_udf15FunctionContextERKNS1_9StringValE", + true, false, true)); Type pcIntermediateType = ScalarType.createFixedUdaIntermediateType(PC_INTERMEDIATE_SIZE); diff --git a/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java b/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java index c2417f6a7e..4de25fe135 100644 --- a/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java +++ b/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java @@ -2132,7 +2132,6 @@ public Map> getFilesSample( parts[selectedIdx] = parts[numFilesRemaining - 1]; --numFilesRemaining; } - return result; } } diff --git a/fe/src/test/java/org/apache/impala/analysis/AnalyzeStmtsTest.java b/fe/src/test/java/org/apache/impala/analysis/AnalyzeStmtsTest.java index 54aa098d2b..133b6e2cbb 100644 --- a/fe/src/test/java/org/apache/impala/analysis/AnalyzeStmtsTest.java +++ b/fe/src/test/java/org/apache/impala/analysis/AnalyzeStmtsTest.java @@ -18,19 +18,23 @@ package org.apache.impala.analysis; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import java.lang.reflect.Field; import java.util.List; +import org.apache.impala.catalog.Column; import org.apache.impala.catalog.PrimitiveType; import org.apache.impala.catalog.ScalarType; +import org.apache.impala.catalog.Table; import org.apache.impala.catalog.Type; import org.apache.impala.common.AnalysisException; import org.apache.impala.common.RuntimeEnv; import org.junit.Assert; import org.junit.Test; +import com.google.common.base.Joiner; import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableList; import com.google.common.collect.Lists; @@ -2050,6 +2054,55 @@ public void TestDistinct() throws AnalysisException { + "from functional.alltypesagg group by 1"); } + @Test + public void TestSampledNdv() throws AnalysisException { + Table allScalarTypes = addAllScalarTypesTestTable(); + String tblName = allScalarTypes.getFullName(); + + // Positive tests: Test all scalar types and valid sampling percents. + double validSamplePercs[] = new double[] { 0.0, 0.1, 0.2, 0.5, 0.8, 1.0 }; + for (double perc: validSamplePercs) { + List allAggFnCalls = Lists.newArrayList(); + for (Column col: allScalarTypes.getColumns()) { + String aggFnCall = String.format("sampled_ndv(%s, %s)", col.getName(), perc); + allAggFnCalls.add(aggFnCall); + String stmtSql = String.format("select %s from %s", aggFnCall, tblName); + SelectStmt stmt = (SelectStmt) AnalyzesOk(stmtSql); + // Verify that the resolved function signature matches as expected. + Type[] args = stmt.getAggInfo().getAggregateExprs().get(0).getFn().getArgs(); + assertEquals(args.length, 2); + assertTrue(col.getType().matchesType(args[0]) || + col.getType().isStringType() && args[0].equals(Type.STRING)); + assertEquals(Type.DOUBLE, args[1]); + } + // Test several calls in the same query block. 
+ AnalyzesOk(String.format( + "select %s from %s", Joiner.on(",").join(allAggFnCalls), tblName)); + } + + // Negative tests: Incorrect number of args. + AnalysisError( + String.format("select sampled_ndv() from %s", tblName), + "No matching function with signature: sampled_ndv()."); + AnalysisError( + String.format("select sampled_ndv(int_col) from %s", tblName), + "No matching function with signature: sampled_ndv(INT)."); + AnalysisError( + String.format("select sampled_ndv(int_col, 0.1, 10) from %s", tblName), + "No matching function with signature: sampled_ndv(INT, DECIMAL(1,1), TINYINT)."); + + // Negative tests: Invalid sampling percent. + String invalidSamplePercs[] = new String[] { + "int_col", "double_col", "100 / 10", "-0.1", "1.1", "100", "50", "-50", "NULL" + }; + for (String invalidPerc: invalidSamplePercs) { + AnalysisError( + String.format("select sampled_ndv(int_col, %s) from %s", invalidPerc, tblName), + "Second parameter of SAMPLED_NDV() must be a numeric literal in [0,1]: " + + invalidPerc); + } + } + @Test public void TestGroupConcat() throws AnalysisException { // Test valid and invalid parameters diff --git a/fe/src/test/java/org/apache/impala/common/FrontendTestBase.java b/fe/src/test/java/org/apache/impala/common/FrontendTestBase.java index c014dffbfc..6718cb483d 100644 --- a/fe/src/test/java/org/apache/impala/common/FrontendTestBase.java +++ b/fe/src/test/java/org/apache/impala/common/FrontendTestBase.java @@ -58,8 +58,8 @@ import org.apache.impala.thrift.TQueryCtx; import org.apache.impala.thrift.TQueryOptions; import org.junit.After; -import org.junit.Assert; import org.junit.AfterClass; +import org.junit.Assert; import org.junit.BeforeClass; import com.google.common.base.Joiner; @@ -233,6 +233,16 @@ protected Table addTestView(String createViewSql) { return dummyView; } + protected Table addAllScalarTypesTestTable() { + addTestDb("allscalartypesdb", ""); + return addTestTable("create table allscalartypes (" + + "bool_col boolean, tinyint_col tinyint, smallint_col smallint, int_col int, " + + "bigint_col bigint, float_col float, double_col double, dec1 decimal(9,0), " + + "d2 decimal(10, 0), d3 decimal(20, 10), d4 decimal(38, 38), d5 decimal(10, 5), " + + "timestamp_col timestamp, string_col string, varchar_col varchar(50), " + + "char_col char (30))"); + } + protected void clearTestTables() { for (Table testTable: testTables_) { testTable.getDb().removeTable(testTable.getName()); diff --git a/tests/query_test/test_aggregation.py b/tests/query_test/test_aggregation.py index 9e0be6d4ba..233c33a561 100644 --- a/tests/query_test/test_aggregation.py +++ b/tests/query_test/test_aggregation.py @@ -275,6 +275,75 @@ def test_parquet_count_star_optimization(self, vector, unique_database): vector.get_value('exec_option')['batch_size'] = 1 self.run_test_case('QueryTest/parquet-stats-agg', vector, unique_database) + def test_sampled_ndv(self, vector, unique_database): + """The SAMPLED_NDV() function is inherently non-deterministic and cannot be + reasonably made deterministic with existing options so we test it separately. + The goal of this test is to ensure that SAMPLED_NDV() works on all data types + and returns approximately sensible estimates. It is not the goal of this test + to ensure tight error bounds on the NDV estimates. 
SAMPLED_NDV() is expected + be inaccurate on small data sets like the ones we use in this test.""" + if (vector.get_value('table_format').file_format != 'text' or + vector.get_value('table_format').compression_codec != 'none'): + # No need to run this test on all file formats + pytest.skip() + + # NDV() is used a baseline to compare SAMPLED_NDV(). Both NDV() and SAMPLED_NDV() + # are based on HyperLogLog so NDV() is roughly the best that SAMPLED_NDV() can do. + # Expectations: All columns except 'id' and 'timestmap_col' have low NDVs and are + # expected to be reasonably accurate with SAMPLED_NDV(). For the two high-NDV columns + # SAMPLED_NDV() is expected to have high variance and error. + ndv_stmt = """ + select ndv(bool_col), ndv(tinyint_col), + ndv(smallint_col), ndv(int_col), + ndv(bigint_col), ndv(float_col), + ndv(double_col), ndv(string_col), + ndv(cast(double_col as decimal(3, 0))), + ndv(cast(double_col as decimal(10, 5))), + ndv(cast(double_col as decimal(20, 10))), + ndv(cast(double_col as decimal(38, 35))), + ndv(cast(string_col as varchar(20))), + ndv(cast(string_col as char(10))), + ndv(timestamp_col), ndv(id) + from functional_parquet.alltypesagg""" + ndv_result = self.execute_query(ndv_stmt) + ndv_vals = ndv_result.data[0].split('\t') + + for sample_perc in [0.1, 0.2, 0.5, 1.0]: + sampled_ndv_stmt = """ + select sampled_ndv(bool_col, {0}), sampled_ndv(tinyint_col, {0}), + sampled_ndv(smallint_col, {0}), sampled_ndv(int_col, {0}), + sampled_ndv(bigint_col, {0}), sampled_ndv(float_col, {0}), + sampled_ndv(double_col, {0}), sampled_ndv(string_col, {0}), + sampled_ndv(cast(double_col as decimal(3, 0)), {0}), + sampled_ndv(cast(double_col as decimal(10, 5)), {0}), + sampled_ndv(cast(double_col as decimal(20, 10)), {0}), + sampled_ndv(cast(double_col as decimal(38, 35)), {0}), + sampled_ndv(cast(string_col as varchar(20)), {0}), + sampled_ndv(cast(string_col as char(10)), {0}), + sampled_ndv(timestamp_col, {0}), sampled_ndv(id, {0}) + from functional_parquet.alltypesagg""".format(sample_perc) + sampled_ndv_result = self.execute_query(sampled_ndv_stmt) + sampled_ndv_vals = sampled_ndv_result.data[0].split('\t') + + assert len(sampled_ndv_vals) == len(ndv_vals) + # Low NDV columns. We expect a reasonaby accurate estimate regardless of the + # sampling percent. + for i in xrange(0, 14): + self.__appx_equals(int(sampled_ndv_vals[i]), int(ndv_vals[i]), 0.1) + # High NDV columns. We expect the estimate to have high variance and error. + # Since we give NDV() and SAMPLED_NDV() the same input data, i.e., we are not + # actually sampling for SAMPLED_NDV(), we expect the result of SAMPLED_NDV() to + # be bigger than NDV() proportional to the sampling percent. + # For example, the column 'id' is a PK so we expect the result of SAMPLED_NDV() + # with a sampling percent of 0.1 to be approximately 10x of the NDV(). + for i in xrange(14, 16): + self.__appx_equals(int(sampled_ndv_vals[i]) * sample_perc, int(ndv_vals[i]), 2.0) + + def __appx_equals(self, a, b, diff_perc): + """Returns True if 'a' and 'b' are within 'diff_perc' percent of each other, + False otherwise. 'diff_perc' must be a float in [0,1].""" + assert abs(a - b) / float(max(a, b)) <= diff_perc + class TestWideAggregationQueries(ImpalaTestSuite): """Test that aggregations with many grouping columns work""" @classmethod