Makefile | 32 ++++- debug_c.c | 51 ++++--- indices.c | 87 ----------- indices.h | 103 ------------- indices.in | 108 ++++++++++++++ itref.h | 185 ------------------------ lsq_indices.h | 106 -------------- lsq_itref.h | 3 +- main.c | 6 +- make-indices | 118 +++++++++++++++ make.dep | 18 --- make.ia64-mkl8 | 5 +- make.inc-ejr-home | 32 ++--- readdata.c | 20 ++-- sgels_rfsx.f | 36 +++--- sgels_x.f | 70 +--------- sgesvxx/debug.fh | 3 + sgesvxx/make.inc-ejr-home | 80 +++++------ sgesvxx/make.x86-ejr.gfortran | 3 +- sgesvxx/timer.c | 49 ++++++- slsq_driver.c | 69 ++------- slsq_get_truth.c | 2 + slsq_itref_test.c | 321 ++++++++++++++++++++++++----------------- 23 files changed, 628 insertions(+), 879 deletions(-) diff --git a/Makefile b/Makefile index 3e6371c..a0bfa40 100644 --- a/Makefile +++ b/Makefile @@ -12,22 +12,35 @@ DRIVER_OBJS= main.o slsq_driver.o slsq_get_truth.o slsq_itref_test.o\ output.o indices.o timer.o NEW_BLAS = BLAS_sgemv_sub_v_x.o BLAS_sgemv2_sub_v2_x.o +LINITREF = sgesvxx +ITREF_LIBS= $(LINITREF)/lin_itref.a $(LINITREF)/lin_itref_test.a \ + $(LINITREF)/libdebug.a $(LINITREF)/liblapackcompat.a + OBJS=$(NEW_BLAS) $(ITREF_OBJS) $(DRIVER_OBJS) BIN=driver -all: config.h $(BIN) +all: indices-gen sgesvxx/config.h config.h $(BIN) + +sgesvxx/config.h: + make -C sgesvxx config.h + +.PHONY: sgesvxx-libs +sgesvxx-libs: + make -C sgesvxx lib + +$(ITREF_LIBS): sgesvxx-libs matlab_header: matlab_header.o output.o indices.o $(LD) -o $@ $(LDFLAGS) matlab_header.o output.o indices.o -include make.dep +-include make.dep dep: gcc -MM -E *.c > make.dep grep -i include *.f|sort|uniq|perl -e 'while (<>) { @ln = m/^(\S+)\.f:\s+include\s+\S(\S+\.fh)/i; if ($$#ln >= 1) { if (!exists $$deps{$$1}) { $$deps{$$1} = [];} push @{$$deps{$$1}}, $$2;} } foreach $$k (keys %deps) { print $$k, ".o:\t", $$k, ".f ", join(" ", @{$$deps{$$k}}), "\n";}' >> make.dep -driver: $(OBJS) - $(LD) -o $@ $(LDFLAGS) $(OBJS) $(LIB_DIR) $(LIBS) +driver: $(OBJS) $(ITREF_LIBS) + $(LD) -o $@ $(LDFLAGS) $(OBJS) $(ITREF_LIBS) $(LIB_DIR) $(LIBS) config.h: config.h.in configure CC="$(CC)" F77="$(F77)" CPPFLAGS="$(INC_DIR)" FFLAGS="$(FFLAGS)" \ @@ -39,6 +52,16 @@ configure: configure.ac config.h.in: configure.ac autoheader +.PHONY: indices-gen +indices-gen: + @perl -w ./make-indices + @cmp indices.c.tmp indices.c >/dev/null 2>/dev/null || mv indices.c.tmp indices.c + @cmp lsq_indices.h.tmp lsq_indices.h >/dev/null 2>/dev/null || mv lsq_indices.h.tmp lsq_indices.h + @rm -f indices.c.tmp lsq_indices.h.tmp + +indices.c: indices-gen +lsq_indices.h: indices-gen + %.o: %.c $(CC) $(CFLAGS) $(INC_DIR) -c $< @@ -50,6 +73,7 @@ archive: clean -zcf ../`basename \`pwd\``-`date +%m-%d-%y`.tar.gz `basename \`pwd\`` clean: + make -C sgesvxx clean rm -f $(BIN) rm -f *.o rm -f *~ diff --git a/debug_c.c b/debug_c.c index 5e3e107..1741ef6 100644 --- a/debug_c.c +++ b/debug_c.c @@ -54,7 +54,7 @@ F77_FUNC_(debug_scheme_out,DEBUG_SCHEME_OUT) (void) } void -F77_FUNC_(debug_rcond,DEBUG_RCOND_PIV)(const float* rcond) +F77_FUNC_(debug_rcond,DEBUG_RCOND)(const float* rcond) { dbg->debug_data[SRCOND_A_I] = *rcond; if (dbg->debug_data[VERBOSE_I]) { @@ -62,6 +62,37 @@ F77_FUNC_(debug_rcond,DEBUG_RCOND_PIV)(const float* rcond) } } +void +F77_FUNC_(debug_alpha,DEBUG_ALPHA)(const float* alpha) +{ + dbg->debug_data[SCLALPHA_I] = *alpha; +} + +int +F77_FUNC_(debug_wrtr,DEBUG_WRTR) (void) +{ + return (int) dbg->debug_data[WRTR_I]; +} + +void +F77_FUNC_(debug_save_ws,DEBUG_SAVE_WS)(const float *w0, const float *w00, + const float *w1, const float *w2, + const float *w3, const float *w4, + const float *w5, const float *w6, + const float *w7, const float *w8) +{ + dbg->debug_data[W0_I] = *w0; + dbg->debug_data[W00_I] = *w00; + dbg->debug_data[W1_I] = *w1; + dbg->debug_data[W2_I] = *w2; + dbg->debug_data[W3_I] = *w3; + dbg->debug_data[W4_I] = *w4; + dbg->debug_data[W5_I] = *w5; + dbg->debug_data[W6_I] = *w6; + dbg->debug_data[W7_I] = *w7; + dbg->debug_data[W8_I] = *w8; +} + #if defined(CLOCK_HIGHRES) #define SAMPLED_CLOCK CLOCK_HIGHRES #else @@ -210,7 +241,6 @@ F77_FUNC_(debug_trace,DEBUG_TRACE) debug_data[DX_I] = -1.0; debug_data[PREVDZ_Z_X_I] = -1.0; debug_data[DZ_Z_X_I] = -1.0; - debug_data[PREVX_I] = -1.0; if (verbose) printf ("SGELS_X, rhs %d\n", j); time_start = ts; } @@ -232,8 +262,6 @@ F77_FUNC_(debug_trace,DEBUG_TRACE) if (1 <= x_state) { debug_data[PREVDX_I] = debug_data[DX_I]; debug_data[DX_I] = absdx; - debug_data[PREVX_I] = debug_data[X_I]; - debug_data[X_I] = absx; } if (1 <= zx_state) { debug_data[PREVDZ_Z_X_I] = debug_data[DZ_Z_X_I]; @@ -306,14 +334,6 @@ F77_FUNC_(debug_trace,DEBUG_TRACE) fprintf (stderr, "Unknown scheme: %d\n", scheme); break; } - if (scheme == -1) { /* Single precision */ - double absx = fabs( x[0] ); - for (i = 1; i < n; ++i) { - double tmp = x[i]; - if (tmp > absx) absx = tmp; - } - debug_data[X_I] = absx; - } debug_data[XSTATE_I] = x_state; debug_data[RSTATE_I] = r_state; debug_data[ZX_STATE_I] = zx_state; @@ -524,10 +544,3 @@ void F77_FUNC_(debug_itref_time, DEBUG_ITREF_TIME) (int* stopflag) { else dbg->debug_data[ITREF_TIME_I] = toc(&tv); } -void F77_FUNC_(debug_normb, DEBUG_NORM_B)(const float *normb) { - dbg->debug_data[NORMB_I] = *normb; -} - -void F77_FUNC_(debug_normr, DEBUG_NORM_R)(const float *normr) { - dbg->debug_data[NORMR_I] = *normr; -} diff --git a/indices.c b/indices.c deleted file mode 100644 index 80c372f..0000000 --- a/indices.c +++ /dev/null @@ -1,87 +0,0 @@ -/* Copyright (c) 2005, Regents of the University of California - See COPYING file for license. */ -#include "lsq_indices.h" - -const char *lsq_data_names[OUTPUT_DATA_SIZE] = { - "test_no", - "info", - "aug_rcond", - "scheme", - "x_state", - "r_state", - "zx_state", - "zr_state", - "dx", - "dz_z_x", - "prev_dx", - "prev_dz_z_x", - "x_bnd", - "x_err", - "r_bnd", - "r_err", - "zx_bnd", - "zx_err", - "zr_bnd", - "zr_err", - "K_norm_x", - "K_comp_x", - "K_norm_r", - "K_comp_r", - "x_berr", - "r_berr", - "srcond_A", - "alpha", - "cos_alpha", - "dgesvxx_drcond", - "dbl_res_count", - "dbl_x_count", - "dx_rmax", - "dz_rmax", - "seed0", - "seed1", - "seed2", - "seed3", - "test_type", - "d_xbnd", - "d_zbnd", - "d_berr", - "first_dx", - "first_dz_z_x", - "prev_x", - "x", - "sngl_res_count", - "time", - "xmin", - "maxtdiff", - "maxtcdiff", - "maxtres", - "residual", - "final_dx_x", - "final_dz_z_x", - "xtmin", - "eqres", - "ymin", - "yerr", - "y", - "xt", - "worst_x", - "worst_c", - "xstopcnt", - "zstopcnt", - "xmode", - "xcond", - "ytcond", - "flags", - "initzerr", - "initxerr", - "ferr_x", - "ferr_r", - "x_rcond", - "lsq_sin", - "total_time", - "itref_time", - "m", - "n", - "normb", - "normr" -}; diff --git a/indices.h b/indices.h deleted file mode 100644 index 9548b14..0000000 --- a/indices.h +++ /dev/null @@ -1,103 +0,0 @@ -/* Copyright (c) 2005, Regents of the University of California - See COPYING file for license. */ -#ifndef _INDICES_H_ -#define _INDICES_H_ - -#define OUTPUT_DATA_SIZE 79 /* number of variables to output */ -#define DEBUG_DATA_SIZE 90 /* total number of items in debug_data array */ -#define MAX_N 5000 /* maximum dimension */ -#define MAX_M 5000 /* maximum dimension */ - -#define TESTNO_I 0 /* Test number */ -#define INFO_I 1 /* Info returned by SGELS_X */ -#define AUG_RCOND_I 2 /* Reciprocal condition number of augmented system, - obtained from QR factorization. */ -#define SCHEME_I 3 /* Scheme used. (old = 1, new = 2) */ -#define XSTATE_I 4 /* Status of x vector at the end of iteration */ -#define RSTATE_I 5 /* Status of r vector at the end of iteration */ -#define ZX_STATE_I 6 /* Status of componentwise-x vector */ -#define ZR_STATE_I 7 /* Status of componentwise-r vector */ -#define DX_I 8 /* Last ||dx|| */ -#define DZ_Z_X_I 9 /* Last |dx|/|x|, componentwise */ -#define PREVDX_I 10 /* Next-to-last |dx|/|x| */ -#define PREVDZ_Z_X_I 11 /* Next-to-last |dz|/|z| */ -#define XBND_I 12 /* Relative normwise error bound on x */ -#define XERR_I 13 /* Relative x error */ -#define RBND_I 14 /* Relative normwise error bound on r */ -#define RERR_I 15 /* Relative residual error, comparerd to b */ -#define ZX_BND_I 16 /* Relative componentwise error bound on x */ -#define ZX_ERR_I 17 /* Relative componentwise error for x */ -#define ZR_BND_I 18 /* Relative componentwise error bound on r */ -#define ZR_ERR_I 19 /* Relative componentwise error for r */ -#define XCOND_NORM 20 /* Normwise condition number w.r.t. x */ -#define XCOND_COMP 21 /* Componentwise condition number w.r.t. x */ -#define RCOND_NORM 22 /* Normwise condition number w.r.t. r */ -#define RCOND_COMP 23 /* Componentwise condition number w.r.t. r */ -#define X_BERR_I 24 /* Backward error */ -#define R_BERR_I 25 /* Backward error (these two are the same) */ -#define SRCOND_A_I 26 /* Single precision rcond for matrix A */ -#define ALPHA_I 27 /* Angle between r and b */ -#define COS_ALPHA_I 28 /* Cosine of the angle between r and b */ -#define DRCOND_I 29 /* Double precision rcond of equilibrated, - augmented system, obtained from DGESVXX */ -#define DBL_RES_COUNT_I 30 /* Number of iterations in old scheme */ -#define DBL_X_COUNT_I 31 /* Number of iterations in new scheme */ -#define DXRMAX_I 32 /* Max ratio of consecutive |dx| */ -#define DZRMAX_I 33 /* Max ratio of consecutive |dz| */ -#define SEED0_I 34 /* Seed use to generate this test case */ -#define SEED1_I 35 /* Seed use to generate this test case */ -#define SEED2_I 36 /* Seed use to generate this test case */ -#define SEED3_I 37 /* Seed use to generate this test case */ -#define TEST_TYPE_I 38 /* Test matrix type, small_piv * 10 + dist_mode */ -#define D_XBND_I 39 /* Normwise error bound, double precision */ -#define D_ZBND_I 40 /* Componentwise error bound, double precision */ -#define D_BERR_I 41 /* Backward error bound, double precision */ -#define FIRST_DX_I 42 /* First dx / x value */ -#define FIRST_DZ_Z_X_I 43 /* First |dz|/|z| value (after stabilized) */ -#define PREVX_I 44 /* Norm of previous x. */ -#define X_I 45 /* Norm of x. */ -#define SNGL_RES_COUNT_I 46 /* Count of b-Ax in single */ -#define TIME_I 47 /* total time */ -#define XMIN_I 48 /* min magnitude x entry */ -#define MAXTDIFF_I 49 /* largest abs(x[i] - x_truth[i]) */ -#define MAXTCDIFF_I 50 /* largest abs((x[i]-x_truth[i])/x_truth[i])*/ -#define MAXTRES_I 51 /* largest abs(b_dble - a_dble * x_truth) */ -#define RES_I 52 /* Largest residual */ -#define FINAL_DX_I 53 /* ||dx|| used in xbnd computation */ -#define FINAL_DZ_Z_X_I 54 /* dz_z used in zbnd computation */ -#define XTMIN_I 55 /* min abs. entry in x_truth */ -#define EQRES_I 56 /* residual for equilibrated system */ -#define YMIN_I 57 /* min abs. entry in equil. x */ -#define YERR_I 58 /* error in equil. x */ -#define Y_I 59 /* norm(y, inf) */ -#define XT_I 60 /* norm(x_truth, inf) */ -#define WORST_X_I 61 /* component with worst relative error */ -#define WORST_C_I 62 /* column scaling c[k] for worst component */ -#define XSTOPCNT_I 63 /* count where X stops */ -#define ZSTOPCNT_I 64 /* count where Z stops */ -#define RANDX_I 65 /* X generation method */ -#define XCOND_I 66 /* Target X condition number */ -#define YTCOND_I 67 /* True y_max / y_min number */ -#define FLAGS_I 68 /* FP flags for refinement */ -#define INITZERR_I 69 /* Initial componentwise error */ -#define INITXERR_I 70 /* Initial normwise error */ -#define FERR_X_I 71 /* Loose forward error bound for x */ -#define FERR_R_I 72 /* Loose forward error bound for r */ -#define X_RCOND_I 73 /* Higham's condition number of X */ -#define LSQ_SIN_I 74 /* Computed sin of the angle between b and r */ -#define TOTAL_TIME_I 75 /* Total time of execution */ -#define ITREF_TIME_I 76 /* Time spent in iterative refinement */ -#define ROWDIM_I 77 /* Row dimension */ -#define COLDIM_I 78 /* Column dimension */ - -#define RATIO_UB_I 86 /* rthresh bound to be used */ -#define ITMAX_I 87 /* iteration maximum */ -#define ADD_LAST_DX_I 88 /* whether to add the last dx */ -#define VERBOSE_I 89 /* verbosity level */ -#define DZ_UB_I 90 -#define SCHEME_IN_I 91 -#define SCHEME_OUT_I 92 - -extern const char *lsq_data_names []; - -#endif diff --git a/indices.in b/indices.in new file mode 100644 index 0000000..dd80c2b --- /dev/null +++ b/indices.in @@ -0,0 +1,108 @@ +#output +TESTNO_I "Test number" +INFO_I "Info returned by SGELS_X" +SEED0_I "Generator seed[0]" +SEED1_I "Generator seed[1]" +SEED2_I "Generator seed[2]" +SEED3_I "Generator seed[3]" +XCOND_NORM_I "Normwise condition number (numerator)" +XCOND_COMP_I "Componentwise condition number w.r.t. x" +RCOND_NORM_I "Normwise condition number (numerator)" +RCOND_COMP_I "Componentwise condition number w.r.t. r" +ALPHA_I "Angle between r and b" +AUG_RCOND_I "rcond of augmented system, from QR" +BT_AX_I "B'*(A*X)" +BT_AXTRUE_I "B'*(A*Xtrue)" +CCONDR_CHEAP_I "cheap condition estimate for x" +COLDIM_I "Column dimension" +COS_ALPHA_I "Cosine of the angle between r and b" +D_BERR_I "Backward error bound, double precision" +DBL_RES_COUNT_I "Number of iterations in old scheme" +DBL_X_COUNT_I "Number of iterations in new scheme" +DRCOND_I "rcond of augmented system reported by dgesvxx" +D_XBND_I "Normwise error bound, double precision" +DX_I "Last ||dx||" +DXRMAX_I "Max ratio of consecutive |dx|" +D_ZBND_I "Componentwise error bound, double precision" +DZRMAX_I "Max ratio of consecutive |dz|" +DZ_Z_X_I "Last |dx|/|x|, componentwise" +FERR_R_I "Loose forward error bound for r" +FERR_X_I "Loose forward error bound for x" +FINAL_DX_I "||dx|| used in xbnd computation" +FINAL_DZ_Z_X_I "dz_z used in zbnd computation" +FIRST_DX_I "First dx / x value" +FIRST_DZ_Z_X_I "First |dz|/|z| value (after stabilized)" +FLAGS_I "FP flags for refinement" +INITXERR_I "Initial normwise error" +INITZERR_I "Initial componentwise error" +ITREF_TIME_I "Time spent in iterative refinement" +LSQ_SIN_I "Computed sin of the angle between b and r" +NCONDR_CHEAP_I "cheap condition estimate for r" +NORM_B_I "norm(B, inf)" +NORM2_B_I "norm(B, 2)**2" +NORM_R_I "norm(R, inf)" +NORM2_R_I "norm(R, 2)**2" +NORM_RTRUE_I "norm(Rtrue, inf)" +NORM2_RTRUE_I "norm(Rtrue, 2)**2" +NORM_X_I "norm(X, inf)" +NORM2_X_I "norm(X, 2)**2" +NORM_XTRUE_I "norm(Xtrue, inf)" +NORM2_XTRUE_I "norm(Xtrue, 2)**2" +NORM2_AX_I "norm(A*X, 2)**2" +NORM2_AXTRUE_I "norm(A*Xtrue, 2)**2" +NORM_ATR_I "norm(A'*R, inf)" +NORM_ATRTRUE_I "norm(A'*Rtrue, inf)" +PREVDX_I "Next-to-last |dx|/|x|" +PREVDZ_Z_X_I "Next-to-last |dz|/|z|" +RANDX_I "X generation method" +R_BERR_I "Backward error (these two are the same)" +RBND_I "Relative normwise error bound on r" +RERR_I "Relative residual error, compared to r" +RERRB_I "Relative residual error, compared to b" +ROWDIM_I "Row dimension" +RSTATE_I "Status of r vector at the end of iteration" +RT_AX_I "R'*(A*X)" +RTMIN_I "min abs. entry in r_truth" +RTRUET_AXTRUE_I "Rtrue'*(A*Xtrue)" +SCHEME_I "Scheme used. (old = 1, new = 2)" +SCLALPHA_I "alpha used in internal scaling" +SNGL_RES_COUNT_I "Count of b-Ax in single" +SRCOND_A_I "Single precision rcond for matrix A" +TEST_TYPE_I "Test matrix type, small_piv * 10 + dist_mode" +TIME_I "total time" +TOTAL_TIME_I "Total time of execution" +X_BERR_I "Backward error" +XBND_I "Relative normwise error bound on x" +XCOND_I "Target X condition number" +XERR_I "Relative x error" +X_RCOND_I "Higham's condition number of X" +XSTATE_I "Status of x vector at the end of iteration" +XSTOPCNT_I "count where X stops" +XTMIN_I "min abs. entry in x_truth" +ZR_BND_I "Relative componentwise error bound on r" +ZR_ERR_I "Relative componentwise error for r" +ZR_STATE_I "Status of componentwise-r vector" +ZSTOPCNT_I "count where Z stops" +ZX_BND_I "Relative componentwise error bound on x" +ZX_ERR_I "Relative componentwise error for x" +ZX_STATE_I "Status of componentwise-x vector" +W00_I +W0_I +W1_I +W2_I +W3_I +W4_I +W5_I +W6_I +W7_I +W8_I + +#input +RATIO_UB_I "rthresh bound to be used" +ITMAX_I "iteration maximum" +ADD_LAST_DX_I "whether to add the last dx" +VERBOSE_I "verbosity level" +DZ_UB_I "stability threshold" +SCHEME_IN_I "starting precision" +SCHEME_OUT_I "highest precision" +WRTR_I "measure dR relative to R rather than B" \ No newline at end of file diff --git a/itref.h b/itref.h deleted file mode 100644 index 8b2ab6f..0000000 --- a/itref.h +++ /dev/null @@ -1,185 +0,0 @@ -/* Copyright (c) 2005, Regents of the University of California - See COPYING file for license. */ -#ifndef _ITREF_H_ -#define _ITREF_H_ - -#include -#include -#include "indices.h" - -#define MALLOC(type, n) ((type *) malloc(sizeof(type) * (n))) - -#ifndef MAX -#define MAX(x, y) ((x) > (y) ? (x) : (y)) -#endif -#ifndef MIN -#define MIN(x, y) ((x) < (y) ? (x) : (y)) -#endif - -enum output_format { - output_verbose = 1, /* Verbose output. */ - output_matrix = 2, /* Put it into a matrix, with one row per test case. */ - output_matlab = 3, /* Matlab binary format (version 5). */ - output_raw = 4 /* Raw sequence of floats. */ -}; - -enum itref_test_method { - test_slatms = 1, /* Use LAPACK's slatms test generator. */ - test_GHJ = 2, /* Use [G*H, J] + d*I to generate low rank matrices - with small k-th pivot. */ - test_file = 3, /* Use hand-picked matrices read from file. */ - test_xblas = 4, /* Use xblas test generator. */ - test_pivmatrix = 5 -}; - -/* - See indices.f: - common /debugvars/ debug_data - REAL DEBUG_DATA(NDEBUGVARS == 31) - Note that this array includes both output variables and - option variables that the fortran code needs. -*/ -extern struct debugvars_common { - double x_truth[MAX_N]; - double r_truth[MAX_M]; - float debug_data[DEBUG_DATA_SIZE]; - int use_eq; -} *dbg; - -struct option_type { - int dim; /* dimensions */ - int m; /* row dimension */ - int n; /* column dimension */ - int nr_iter; /* number of tests */ - int use_eq; /* use equilibration */ - int pre_eq; /* equilibrate before running refinement */ - int scheme_in, scheme_out; /* scheme to use */ - int verbose; /* verbosity level. -1 is quiet, 0 is default. */ - int seed[4]; /* seed */ - char *fname; /* input file name */ - FILE *output_file; - int prec; /* precision to be used. */ - enum output_format format; - enum itref_test_method method; - int mode; - float cond; - float ratio_ub; /* rmax bound */ - int small_pivot; /* makes n-th pivot small. */ - int rank_def; /* rank defficiency */ - double eta; - double eps1, eps2; /* scalings */ - int itmax; - int add_last_dx; - float rthresh; - float dz_ub; - int rand_b; - int xmode; - double xcond; - int rand_scaling; - int rand_rowscl; - enum blas_trans_type trans; -}; - -/* Utility functions */ -void dprint_vector(double *x, int n, char *name); -void sprint_vector(float *x, int n, char *name); -void dprint_matrix(double *a, int m, int n, char *name); -void sprint_matrix(float *a, int m, int n, char *name); - -/* Data generation routines */ -void randdata(int n, int d, int k, - float *a, float eta, int *iseed); -void randdata2(int m, int n, int r, int mode, float cond, - float *a, int *iseed); -int readdata(const char *fname, int *n, float **a, float **b, float **x); -void piv_matrix(int n, int *seed, float *a, float *b); - -/* Internal routines */ -int slsq_get_truth(struct option_type *options, int m, int n, float alpha, - double *a, double *b, double *r_truth, double *x_truth, - double *rcond, double *ferr, double* rcond_nrm, - double *ferr_cmp, double* rcond_cmp, double *berr); -void slsq_itref_test(int m, int n, float *a, float *b, double *r_truth, - double *x_truth, struct option_type *options); -void output_init(FILE *f, enum output_format format, int nr_data); -void output_done(FILE *f, enum output_format format); -void output_data(FILE *f, const float *data, enum output_format format); -void matlab_write_matrix_header(FILE *f, int rows, int cols); -void matlab_write_header(FILE *f); -void slsq_driver(struct option_type *options); - -/* Wrappers to Fortran routines */ -int c_dgesvxx(char fact, char trans, int n, int nrhs, - double *a, int lda, double *af, int ldaf, int *ipiv, char *equed, - double *r, double *c, double *b, int ldb, double *x, int ldx, - double *rcond, double *ferr, double *rcond_nrm, - double *ferr_cmp, double *rcond_cmp, double *berr, - int nparams, double* params); - -int c_sgesvxx(char fact, char trans, int n, int nrhs, - float *a, int lda, float *af, int ldaf, int *ipiv, char *equed, - float *r, float *c, float *b, int ldb, float *x, int ldx, - float *rcond, float *ferr, float *rcond_nrm, - float *ferr_cmp, float *rcond_cmp, float *berr, - int nparams, float* params); - -int -c_sgels_x(char* trans, int m, int n, int nrhs, float *a, int lda, - float *af, int ldaf, float *b, int ldb, float *x, int ldx, - float *r, int ldr, int nparams, float* params, float* errbnd); - -/* Returns a random int between 0 and 4095 */ -int ilaran(int *iseed); - -double c_dlaran(int *iseed); -double c_dlarnd(int dist, int *iseed); -float c_slaran(int *iseed); -float c_slarnd(int dist, int *iseed); -void c_slarnv (int idist, int* iseed, int n, float* x); - -int c_slagge2(int m, int n, int side, float *a, int lda, int *iseed); -int c_slatm1(int mode, float cond, int irsign, int idist, - int *iseed, float *d, int n); - -void c_dgemm(char transa, char transb, int m, int n, int k, - double alpha, double *a, int lda, double *b, int ldb, - double beta, double *c, int ldc); -void c_sgemm(char transa, char transb, int m, int n, int k, - float alpha, float *a, int lda, float *b, int ldb, - float beta, float *c, int ldc); -int c_slatms(int m, int n, char dist, int *iseed, char sym, - float *d, int mode, float cond, float dmax, int kl, int ku, - char pack, float *a, int lda); - -int c_sgeequb(int m, int n, float* a, int lda, float* r, float* c, - float* rowcnd, float* colcnd, float* amax); -void c_slascl2(int m, int n, const float* d, float* x); -void c_slaqge (int m, int n, float* a, int lda, float* r, float* c, - float* rowcnd, float* colcnd, float* amax, char *equed); - -int c_ila_equed (char equed); - -float c_snrm2(int n, float *x, int incx); - -int c_sormqr(char side, char trans, int m, int nrhs, int n, float *a, int lda, - float *tau, float *c, int ldc, float *work, int lwork); -int c_sgeqrf(int m, int n, float *a, int lda, float *tau, float *work, - int lwork); -int c_strcon(char norm, char uplo, char diag, int n, float *a, int lda, - float *rcond, float *work); -float c_slange(char norm, int m, int n, float *a, int lda, float *work); - -void BLAS_sgemv2_sub_v2_x(enum blas_order_type order, - enum blas_trans_type trans, int m, int n, - float alpha, const float *a, int lda, - const float *head_x, const float *tail_x, int incx, - const float *head_z, const float *tail_z, int incz, - float beta, float *y, int incy, - enum blas_prec_type prec); -void BLAS_sgemv_sub_v_x(enum blas_order_type order, - enum blas_trans_type trans, int m, int n, float alpha, - const float *a, int lda, const float *x, int incx, - const float *z, int incz, float beta, float *y, - int incy, enum blas_prec_type prec); -#endif - diff --git a/lsq_indices.h b/lsq_indices.h deleted file mode 100644 index 6c39c40..0000000 --- a/lsq_indices.h +++ /dev/null @@ -1,106 +0,0 @@ -/* Copyright (c) 2005, Regents of the University of California - See COPYING file for license. */ -#ifndef _INDICES_H_ -#define _INDICES_H_ - -#define OUTPUT_DATA_SIZE 81 /* number of variables to output */ -#define DEBUG_DATA_SIZE 90 /* total number of items in debug_data array */ - -#define MAX_N 5000 /* maximum dimension */ -#define MAX_M 5000 /* maximum dimension */ - -#define TESTNO_I 0 /* Test number */ -#define INFO_I 1 /* Info returned by SGELS_X */ -#define AUG_RCOND_I 2 /* Reciprocal condition number of augmented system, - obtained from QR factorization. */ -#define SCHEME_I 3 /* Scheme used. (old = 1, new = 2) */ -#define XSTATE_I 4 /* Status of x vector at the end of iteration */ -#define RSTATE_I 5 /* Status of r vector at the end of iteration */ -#define ZX_STATE_I 6 /* Status of componentwise-x vector */ -#define ZR_STATE_I 7 /* Status of componentwise-r vector */ -#define DX_I 8 /* Last ||dx|| */ -#define DZ_Z_X_I 9 /* Last |dx|/|x|, componentwise */ -#define PREVDX_I 10 /* Next-to-last |dx|/|x| */ -#define PREVDZ_Z_X_I 11 /* Next-to-last |dz|/|z| */ -#define XBND_I 12 /* Relative normwise error bound on x */ -#define XERR_I 13 /* Relative x error */ -#define RBND_I 14 /* Relative normwise error bound on r */ -#define RERR_I 15 /* Relative residual error, comparerd to b */ -#define ZX_BND_I 16 /* Relative componentwise error bound on x */ -#define ZX_ERR_I 17 /* Relative componentwise error for x */ -#define ZR_BND_I 18 /* Relative componentwise error bound on r */ -#define ZR_ERR_I 19 /* Relative componentwise error for r */ -#define XCOND_NORM 20 /* Normwise condition number w.r.t. x */ -#define XCOND_COMP 21 /* Componentwise condition number w.r.t. x */ -#define RCOND_NORM 22 /* Normwise condition number w.r.t. r */ -#define RCOND_COMP 23 /* Componentwise condition number w.r.t. r */ -#define X_BERR_I 24 /* Backward error */ -#define R_BERR_I 25 /* Backward error (these two are the same) */ -#define SRCOND_A_I 26 /* Single precision rcond for matrix A */ -#define ALPHA_I 27 /* Angle between r and b */ -#define COS_ALPHA_I 28 /* Cosine of the angle between r and b */ -#define DRCOND_I 29 /* Double precision rcond of equilibrated, - augmented system, obtained from DGESVXX */ -#define DBL_RES_COUNT_I 30 /* Number of iterations in old scheme */ -#define DBL_X_COUNT_I 31 /* Number of iterations in new scheme */ -#define DXRMAX_I 32 /* Max ratio of consecutive |dx| */ -#define DZRMAX_I 33 /* Max ratio of consecutive |dz| */ -#define SEED0_I 34 /* Seed use to generate this test case */ -#define SEED1_I 35 /* Seed use to generate this test case */ -#define SEED2_I 36 /* Seed use to generate this test case */ -#define SEED3_I 37 /* Seed use to generate this test case */ -#define TEST_TYPE_I 38 /* Test matrix type, small_piv * 10 + dist_mode */ -#define D_XBND_I 39 /* Normwise error bound, double precision */ -#define D_ZBND_I 40 /* Componentwise error bound, double precision */ -#define D_BERR_I 41 /* Backward error bound, double precision */ -#define FIRST_DX_I 42 /* First dx / x value */ -#define FIRST_DZ_Z_X_I 43 /* First |dz|/|z| value (after stabilized) */ -#define PREVX_I 44 /* Norm of previous x. */ -#define X_I 45 /* Norm of x. */ -#define SNGL_RES_COUNT_I 46 /* Count of b-Ax in single */ -#define TIME_I 47 /* total time */ -#define XMIN_I 48 /* min magnitude x entry */ -#define MAXTDIFF_I 49 /* largest abs(x[i] - x_truth[i]) */ -#define MAXTCDIFF_I 50 /* largest abs((x[i]-x_truth[i])/x_truth[i])*/ -#define MAXTRES_I 51 /* largest abs(b_dble - a_dble * x_truth) */ -#define RES_I 52 /* Largest residual */ -#define FINAL_DX_I 53 /* ||dx|| used in xbnd computation */ -#define FINAL_DZ_Z_X_I 54 /* dz_z used in zbnd computation */ -#define XTMIN_I 55 /* min abs. entry in x_truth */ -#define EQRES_I 56 /* residual for equilibrated system */ -#define YMIN_I 57 /* min abs. entry in equil. x */ -#define YERR_I 58 /* error in equil. x */ -#define Y_I 59 /* norm(y, inf) */ -#define XT_I 60 /* norm(x_truth, inf) */ -#define WORST_X_I 61 /* component with worst relative error */ -#define WORST_C_I 62 /* column scaling c[k] for worst component */ -#define XSTOPCNT_I 63 /* count where X stops */ -#define ZSTOPCNT_I 64 /* count where Z stops */ -#define RANDX_I 65 /* X generation method */ -#define XCOND_I 66 /* Target X condition number */ -#define YTCOND_I 67 /* True y_max / y_min number */ -#define FLAGS_I 68 /* FP flags for refinement */ -#define INITZERR_I 69 /* Initial componentwise error */ -#define INITXERR_I 70 /* Initial normwise error */ -#define FERR_X_I 71 /* Loose forward error bound for x */ -#define FERR_R_I 72 /* Loose forward error bound for r */ -#define X_RCOND_I 73 /* Higham's condition number of X */ -#define LSQ_SIN_I 74 /* Computed sin of the angle between b and r */ -#define TOTAL_TIME_I 75 /* Total time of execution */ -#define ITREF_TIME_I 76 /* Time spent in iterative refinement */ -#define ROWDIM_I 77 /* Row dimension */ -#define COLDIM_I 78 /* Column dimension */ -#define NORMB_I 79 /* norm of the rhs B */ -#define NORMR_I 80 /* norm of the residual R */ - -#define RATIO_UB_I 86 /* rthresh bound to be used */ -#define ITMAX_I 87 /* iteration maximum */ -#define ADD_LAST_DX_I 88 /* whether to add the last dx */ -#define VERBOSE_I 89 /* verbosity level */ -#define DZ_UB_I 90 -#define SCHEME_IN_I 91 -#define SCHEME_OUT_I 92 - -extern const char *lsq_data_names []; - -#endif diff --git a/lsq_itref.h b/lsq_itref.h index 3886c4a..4d26a47 100644 --- a/lsq_itref.h +++ b/lsq_itref.h @@ -78,6 +78,7 @@ struct option_type { int rand_scaling; int rand_rowscl; enum blas_trans_type trans; + int wrtr; }; /* Utility functions */ @@ -91,7 +92,7 @@ void randdata(int n, int d, int k, float *a, float eta, int *iseed); void randdata2(int m, int n, int r, int mode, float cond, float *a, int *iseed); -int readdata(const char *fname, int *n, float **a, float **b, float **x); +int readdata(const char *fname, int *m, int *n, float **a, float **b, float **x); void piv_matrix(int n, int *seed, float *a, float *b); /* Internal routines */ diff --git a/main.c b/main.c index ebf066d..6d8141e 100644 --- a/main.c +++ b/main.c @@ -5,7 +5,7 @@ #include #include #include -#include "lin/fpu.h" +#include "sgesvxx/fpu.h" #include "lsq_itref.h" void @@ -14,6 +14,7 @@ print_usage() puts("driver [options] N"); puts(" N Runs N iterative refinement experiments."); puts(" N defaults to 1."); + puts(" -wrtr use dr/r rather than dr/b"); puts(" -eps x Sets both eps1 and eps2 to 2^x. Defaults to random x."); puts(" -eps1 x Sets eps1 to 2^x."); puts(" -eps2 x Sets eps2 to 2^x."); @@ -122,6 +123,7 @@ init_options(struct option_type *options) options->rand_scaling = 1; options->rand_rowscl = 0; options->trans = blas_no_trans; + options->wrtr = 1; } int @@ -140,6 +142,8 @@ main(int argc, char **argv) if (strcmp(arg, "-h") == 0) { print_usage(); exit(0); + } else if (strcmp(arg, "-wrtr") == 0) { + options.wrtr = 0; } else if (strcmp(arg, "-eq") == 0) { options.use_eq = 1; } else if (strcmp(arg, "-noeq") == 0) { diff --git a/make-indices b/make-indices new file mode 100755 index 0000000..aafb423 --- /dev/null +++ b/make-indices @@ -0,0 +1,118 @@ +#!/usr/bin/perl -w + + +my %name; +my %comment; + +my $mode = ""; + +my @outname; +my @outcomment; +my @inname; +my @incomment; + +$name{"output"} = \@outname; +$comment{"output"} = \@outcomment; +$name{"input"} = \@inname; +$comment{"input"} = \@incomment; + +open INDICES, "indices.in" or die "Couldn't open indices.in"; +while () { + chomp; + next if (/^$/); + + if (/#(\S+)/) { + $mode = $1; + next; + } + + if (/^(\S+)(?:\s+"(.+)")?/) { + push @{$name{$mode}}, $1; + if (defined $2) { + push @{$comment{$mode}}, $2; + } + else { + print STDERR "Warning: No comment for $1\n"; + push @{$comment{$mode}}, "XXX Needs comment"; + } + } +} +close INDICES; + +my $nout = scalar @outname; +my $nin = scalar @inname; + +my $inbase = 100*(int($nout / 100)+1); +my $debug_data_size = $inbase + $nin; + +open LSQ_INDICES_H, "> lsq_indices.h.tmp" or die "Cannot open lsq_indices.h"; + +print LSQ_INDICES_H < indices.c.tmp" or die "Cannot open indices.c"; +print INDICES_C < G ) ERRBND(11, J) = MAX(ERRBND(11, J), S) -! ERRBND(11, J) = (W3 + W7) / NORMR + ERRBND(11, J) = (W3 + W7) ERRBND(12, J) = W4 + W8 * Simpler formula for K_norm_r, which is cheaper to compute -!XSL - ERRBND(13, J) = (W0 + W7) / NORMB - IF ( S > G ) ERRBND(13, J) = MAX(ERRBND(13, J), S) -! ERRBND(13, J) = (W0 + W7) / NORMR +! print*, 'W0 = ', W0, ' W7 = ', W7, ' NORMB = ', NORMB + ERRBND(13, J) = (W0 + W7) ERRBND(14, J) = W00 + W8 - call debug_normb(normb) - call debug_normr(normr) + + call debug_save_ws(w0,w00,w1,w2,w3,w4,w5,w6,w7,w8) * * End of loop for each RHS * diff --git a/sgels_x.f b/sgels_x.f index fa09be9..258e75e 100644 --- a/sgels_x.f +++ b/sgels_x.f @@ -213,7 +213,6 @@ LOGICAL LQUERY, TPSD INTEGER BROW, I, IASCL, IBSCL, J, MN, NB, SCLLEN, $ WSIZE, M_N - REAL ANRM, BIGNUM, BNRM, SMLNUM REAL ANRM1, ANRMI, BNORM, ALPHA * * Reciprocal condition number of A and the augmented system @@ -312,57 +311,6 @@ RETURN END IF * -* Get machine parameters -* - SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' ) - BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) -* -* Scale A, B if max element outside range [SMLNUM,BIGNUM] -* - ANRM = SLANGE( 'M', M, N, A, LDA, RWORK ) - IASCL = 0 - IF( ANRM.GT.ZERO .AND. ANRM.LT.SMLNUM ) THEN -* -* Scale matrix norm up to SMLNUM -* - CALL SLASCL( 'G', 0, 0, ANRM, SMLNUM, M, N, A, LDA, INFO ) - IASCL = 1 - ELSE IF( ANRM.GT.BIGNUM ) THEN -* -* Scale matrix norm down to BIGNUM -* - CALL SLASCL( 'G', 0, 0, ANRM, BIGNUM, M, N, A, LDA, INFO ) - IASCL = 2 - ELSE IF( ANRM.EQ.ZERO ) THEN -* -* Matrix all zero. Return zero solution. -* - CALL SLASET( 'F', MAX( M, N ), NRHS, ZERO, ZERO, B, LDB ) - GO TO 50 - END IF -* - BROW = M - IF( TPSD ) - $ BROW = N - BNRM = SLANGE( 'M', BROW, NRHS, B, LDB, RWORK ) - IBSCL = 0 - IF( BNRM.GT.ZERO .AND. BNRM.LT.SMLNUM ) THEN -* -* Scale matrix norm up to SMLNUM -* - CALL SLASCL( 'G', 0, 0, BNRM, SMLNUM, BROW, NRHS, B, LDB, - $ INFO ) - IBSCL = 1 - ELSE IF( BNRM.GT.BIGNUM ) THEN -* -* Scale matrix norm down to BIGNUM -* - CALL SLASCL( 'G', 0, 0, BNRM, BIGNUM, BROW, NRHS, B, LDB, - $ INFO ) - IBSCL = 2 - END IF -* * Make a copy of A and the righ-hand side matrix B * CALL SLACPY( 'Full', M, NRHS, B, LDB, X, LDX ) @@ -506,6 +454,7 @@ * ALPHA = RCOND_A * ANRMI / 1.4142 ALPHA = 2.0 ** INT( LOG(ALPHA) / LOG(2.0) ) + call debug_alpha(alpha) * alpha = one * print*, 'SGELS_X: ANRMI ', anrmi, ' ALPHA ', alpha * @@ -543,23 +492,6 @@ * END IF * -* Undo scaling -* - IF( IASCL.EQ.1 ) THEN - CALL SLASCL( 'G', 0, 0, ANRM, SMLNUM, SCLLEN, NRHS, X, LDX, - $ INFO ) - ELSE IF( IASCL.EQ.2 ) THEN - CALL SLASCL( 'G', 0, 0, ANRM, BIGNUM, SCLLEN, NRHS, X, LDX, - $ INFO ) - END IF - IF( IBSCL.EQ.1 ) THEN - CALL SLASCL( 'G', 0, 0, SMLNUM, BNRM, SCLLEN, NRHS, X, LDX, - $ INFO ) - ELSE IF( IBSCL.EQ.2 ) THEN - CALL SLASCL( 'G', 0, 0, BIGNUM, BNRM, SCLLEN, NRHS, X, LDX, - $ INFO ) - END IF -* 50 CONTINUE WORK( 1 ) = REAL( WSIZE ) * diff --git a/sgesvxx/debug.fh b/sgesvxx/debug.fh index 1bda762..a666bde 100644 --- a/sgesvxx/debug.fh +++ b/sgesvxx/debug.fh @@ -24,3 +24,6 @@ external debug_berr_time external debug_init_count, debug_incr_count external debug_xstop, debug_zstop, debug_finstate + external debug_rcond, debug_alpha, debug_save_ws + external debug_wrtr + integer debug_wrtr diff --git a/sgesvxx/make.inc-ejr-home b/sgesvxx/make.inc-ejr-home index a50651a..990d6c3 100644 --- a/sgesvxx/make.inc-ejr-home +++ b/sgesvxx/make.inc-ejr-home @@ -1,52 +1,38 @@ # Copyright (c) 2005, Regents of the University of California # See COPYING file for license. -# IA64 with Intel's 9 compilers and MKL 8. -# -# Uses installation in ejr's home directory. -# -PLAT=ia64-unknown-linux-gnu -HL=/home/cs/ejr/.local/ia64-unknown-linux-gnu -INC_DIR=-I/project/cs/demmel/b/projects/xblas/include -I. -LIB_DIR=-L/project/cs/demmel/b/projects/xblas/$(PLAT)/lib \ - -L$(HL)/lib \ - -L/project/cs/demmel/b/projects/lapack/$(PLAT)/lib.icc9 - -LIBS= -lxblas -ltmg -lmkl_lapack -lmkl -lgmp -lm -lrt -lguide -#LIBS= -lxblas -ltmg -llapack-3 -lmkl -lgmp -lm -lrt -lguide -lg2c -#LIBS= -lxblas -ltmg -llapack-3 -lblas-3 -lgmp -lm -lrt -lg2c - -CC=$(HL)/bin/icc -CFLAGS= -DUSE_F2C_CONVENTIONS -O -F77=$(HL)/bin/ifort -FFLAGS=-nofor_main -no-ftz -fno-alias -funroll-loops -O3 $(PROFFLAGS) -NO_OPTS=-g -nofor_main -mp -no-ftz -IPF_flt_eval_method0 -LD=$(F77) -nofor_main -FC=$(F77) -FCFLAGS=$(FFLAGS) -FREE=-free -FIXED=-fixed -ARCH=ar r +# Intel x86 platform with gcc compiler +PLAT=i686-pc-linux-gnu +INC_DIR=-I/home/ejr/Projects/xblas/local/support -I. +LIB_DIR=-L/home/ejr/Projects/xblas/local/support +LIBS= -lxblas -ltmg -llapack-3 -lblas-3 -lm -lrt -lg2c +#LIBS= -lxblas -ltmg /opt/acml3.1.0/gfortran32_nosse2/lib/libacml.a -lm -lrt -lg2c + +CC=gcc +CFLAGS=-O2 -Dx86 -g -Wall +F77=gfortran -ff2c +FFLAGS=-O2 -g +NO_OPTS=-O0 +LD=$(F77) + +FTNCHEK=ftnchek + +FTNCHEK_ARGS=-usage=no-arg-array-alias -array=no-dimensions -nonovice +FTNCHEK_SRC=c2f_fside.f dgeequb.f dgerfsx.f dgesvxx.f dla_geamv.f dla_gerfsx_extended.f dla_gerfsx_normest.f dla_gerfsx_working.f dla_wwaddw.f dlascl2.f f77_util.f la_utils.f sgeequb.f sgerfsx.f sgesvxx.f sla_geamv.f sla_gerfsx_extended.f sla_gerfsx_normest.f sla_gerfsx_wilk.f sla_gerfsx_working.f sla_wwaddw.f slagge2.f slascl2.f + +ARCH=ar +ARCHFLAGS=cr RANLIB=ranlib -ifdef PROFILING -CC += -qp -F77 += -qp -endif - -ifndef NOPROFOPT -ifdef PROFOPT -PROF_DIR=$(TOPLEV)/icc-prof -PROFFLAGS= -ifeq ($(PROFOPT),gen) -PROFFLAGS=-prof-dir $(PROF_DIR) -prof-gen -endif -ifeq ($(PROFOPT),use) -PROFFLAGS=-prof-dir $(PROF_DIR) -prof-use -endif -endif -else -PROFFLAGS= -endif - -FTNCHEK= +# tell make what a project file suffix is +.SUFFIXES: .prj + +# these options suppress global checks. +FTNCHEK_NOGLOBAL=-usage=no-ext-undefined,no-com-\* + +# tell make how to create a .prj file from a .f file +.f.prj: + $(FTNCHEK) -project $(FTNCHEK_NOGLOBAL) $(FTNCHEK_ARGS) -library $< + +# set up macro PRJS containing project filenames +PRJS= $(FTNCHEK_SRC:.f=.prj) diff --git a/sgesvxx/make.x86-ejr.gfortran b/sgesvxx/make.x86-ejr.gfortran index d1819cb..990d6c3 100644 --- a/sgesvxx/make.x86-ejr.gfortran +++ b/sgesvxx/make.x86-ejr.gfortran @@ -4,7 +4,8 @@ PLAT=i686-pc-linux-gnu INC_DIR=-I/home/ejr/Projects/xblas/local/support -I. LIB_DIR=-L/home/ejr/Projects/xblas/local/support -LIBS= -lxblas -ltmg /opt/acml3.1.0/gfortran32_nosse2/lib/libacml.a -lm -lrt -lg2c +LIBS= -lxblas -ltmg -llapack-3 -lblas-3 -lm -lrt -lg2c +#LIBS= -lxblas -ltmg /opt/acml3.1.0/gfortran32_nosse2/lib/libacml.a -lm -lrt -lg2c CC=gcc CFLAGS=-O2 -Dx86 -g -Wall diff --git a/sgesvxx/timer.c b/sgesvxx/timer.c index a15989e..98f807b 100644 --- a/sgesvxx/timer.c +++ b/sgesvxx/timer.c @@ -1,3 +1,5 @@ +#include + #include #include "config.h" @@ -9,12 +11,51 @@ void tic(struct timespec *tv) { double toc(struct timespec *tv) { struct timespec tv2; - double sec, nsec; if (clock_gettime(SAMPLED_CLOCK, &tv2)) tv2.tv_sec = tv2.tv_nsec = -1; - sec = (double) (tv2.tv_sec - tv->tv_sec); - nsec = (double) (tv2.tv_nsec - tv->tv_nsec); - + double sec = (double) (tv2.tv_sec - tv->tv_sec); + double nsec = (double) (tv2.tv_nsec - tv->tv_nsec); + return (sec + 1.0e-9 * nsec); } +#ifdef USE_F2C_CONVENTIONS +typedef double f2c_float_ret; +#else +typedef float f2c_float_ret; +#endif + +/* Cheat and "normalize" by a base time. Reduces errors when using + floats (SECOND). +*/ +static double basetic = -1.0; + +f2c_float_ret +F77_FUNC(second,SECOND) (void) +{ + struct timespec tv; + double out; + if (basetic < 0) { + if (clock_gettime(SAMPLED_CLOCK, &tv)) return -1.0; + basetic = (double)tv.tv_sec + 1.0e-9*tv.tv_nsec; + return 0.0; + } + if (clock_gettime(SAMPLED_CLOCK, &tv)) return -1.0; + out = ((double)tv.tv_sec + 1.0e-9*tv.tv_nsec) - basetic; + return (f2c_float_ret)out; +} + +double +F77_FUNC(dsecnd,DSECND) (void) +{ + struct timespec tv; + double out; + if (basetic < 0) { + if (clock_gettime(SAMPLED_CLOCK, &tv)) return -1.0; + basetic = (double)tv.tv_sec + 1.0e-9*tv.tv_nsec; + return 0.0; + } + if (clock_gettime(SAMPLED_CLOCK, &tv)) return -1.0; + out = ((double)tv.tv_sec + 1.0e-9*tv.tv_nsec) - basetic; + return out; +} diff --git a/slsq_driver.c b/slsq_driver.c index 86cc35a..3e95cd8 100644 --- a/slsq_driver.c +++ b/slsq_driver.c @@ -57,6 +57,7 @@ void slsq_driver(struct option_type *options) { dbg->debug_data[ITMAX_I] = options->itmax; dbg->debug_data[ADD_LAST_DX_I] = options->add_last_dx; dbg->debug_data[DZ_UB_I] = options->dz_ub; + dbg->debug_data[WRTR_I] = options->wrtr; if (verbose > 0) { printf(" nr_iter = %d\n", options->nr_iter); @@ -68,8 +69,7 @@ void slsq_driver(struct option_type *options) { if (use_readdata) { /* Read A, b, and x from a file. The x read in will be treated as a true solution. */ - int err = readdata(options->fname, &n, &a, &b, &x); - m = n; + int err = readdata(options->fname, &m, &n, &a, &b, &x); if (err) { fprintf(stderr, "Invalid file.\n"); exit(-1); @@ -267,41 +267,26 @@ void slsq_driver(struct option_type *options) { if ( alpha != 0. ) { for (i = 0; i < m; ++i) b2[i] /= alpha; } -#if 0 - /* generate random alpha */ - alpha = c_slaran(seed) * pi / 2; /* uniform in (0, pi/2) */ + + /* Generate random angle between A*x and b */ + expnt = c_slaran(seed); /* U(0,1) */ + expnt = 25*expnt; /* U(0, 25) */ + expnt = expnt - 26; /* U(-26, -1) */ + alpha = M_PI * pow(2.0, expnt); /* pi * 2**U(-26, -1) */ + c = cos(alpha); -#else - if ( k % cos_01_freq == 0 ) { - c = 0.0; /* consistent system */ - } else if ( k % (2*cos_01_freq) == 0 ) { - c = 1.0; /* r.h.s. orthogonal to range of A */ - } else { - /* generate random cos(alpha) */ - c = c_slaran(seed); /* uniform in (0,1) */ - } - alpha = acos(c); -#endif - s = sin(alpha); -#if 0 -#if 0 - /* b is in range(A), consistent system, bad for r. */ - alpha = 0.0; - s = 0.0; c = 1.0; -#else - /* b is orthogonal to range(A), bad for x. */ - alpha = pi / 2.0; - s = 1.0; c = 0.0; -#endif -#endif + /* extra precision preserves identity c**2+s**2=1 */ + s = sqrtl(1.0 - cosl(alpha)*cosl(alpha)); + dbg->debug_data[ALPHA_I] = alpha; dbg->debug_data[COS_ALPHA_I] = c; - for (i = 0; i < m; ++i) b[i] = c * b1[i] + s * b2[i]; + for (i = 0; i < m; ++i) b[i] = (double)(c * (long double)b1[i] + + s * (long double)b2[i]); if ( verbose >= 2 ) sprint_vector(b, m, "b"); if ( verbose ) { float t = 0; - printf("RHS: alpha (angle between b & b1) = %e, sin = %e\n", - alpha, s); + printf("RHS: alpha (angle between b & b1) = %e*pi/2, sin = %e\n", + 2.0*alpha/M_PI, s); for (i = 0, t = 0.; i < m; ++i) t += b1[i] * b[i]; printf("b1 & b orthogonality %e\n", t); } @@ -370,27 +355,6 @@ void slsq_driver(struct option_type *options) { if ( options->rand_b ) for (j = 0; j < n; ++j) x[j] = x_truth[j]; - - { - double tmp, maxdiff, maxcdiff, xmin, xmax; - xmin = fabs(x_truth[0]); - xmax = fabs(x_truth[0]); - maxdiff = 0.0; - maxcdiff = 0.0; - for (i = 0; i < n; ++i) { - tmp = fabs(x_truth[i]); - if (tmp < xmin) xmin = tmp; - if (tmp > xmax) xmax = tmp; - tmp = fabs(x_truth[i] - (double)x[i]); - if (tmp > maxdiff) maxdiff = tmp; - tmp /= fabs(x_truth[i]); - if (tmp > maxcdiff) maxcdiff = tmp; - } - dbg->debug_data[MAXTDIFF_I] = (float)maxdiff; - dbg->debug_data[MAXTCDIFF_I] = (float)maxcdiff; - dbg->debug_data[XTMIN_I] = (float)xmin; - dbg->debug_data[XT_I] = (float)xmax; - } #if 0 { /* Another way of computing r_truth */ @@ -405,7 +369,6 @@ void slsq_driver(struct option_type *options) { tmp = fabs(b_dble[i]); if (tmp > maxres) maxres = tmp; } - dbg->debug_data[MAXTRES_I] = (float)maxres; if (verbose > 0) { dprint_vector(b_dble, m, "r_truth = b - A*x_truth"); } diff --git a/slsq_get_truth.c b/slsq_get_truth.c index f5ff559..92b1d3a 100644 --- a/slsq_get_truth.c +++ b/slsq_get_truth.c @@ -66,6 +66,8 @@ slsq_get_truth(struct option_type *options, int m, int n, float alpha, bu, m_n, xu, m_n, rcond, &rpvgrw, berr, n_err_bnds, err_bnds, 0, NULL); + *ferr = err_bnds[/*(1,1,1) = */ 0 ]; + *ferr_cmp = err_bnds[/*(1,2,1) = */ 1 ]; #endif #else info = c_dgesvxx('N', 'N', m_n, 1, au, m_n, auf, m_n, ipiv, &equed, diff --git a/slsq_itref_test.c b/slsq_itref_test.c index 115e46c..812df8e 100644 --- a/slsq_itref_test.c +++ b/slsq_itref_test.c @@ -9,6 +9,101 @@ #include "lsq_indices.h" #include "timer.h" +static float +norminff(const int m, const float *x) +{ + int i; + float out = 0.0; + for (i = 0; i < m; ++i) { + const float tmp = fabsf(x[i]); + if (tmp > out) out = tmp; + } + return out; +} + +static double +norminf(const int m, const double *x) +{ + int i; + double out = 0.0; + for (i = 0; i < m; ++i) { + const double tmp = fabs(x[i]); + if (tmp > out) out = tmp; + } + return out; +} + +static long double +norminf_diff_fd(const int m, const float *a, const double *b) +{ + int i; + long double out = 0.0; + for (i = 0; i < m; ++i) { + const long double ai = a[i]; + const long double bi = b[i]; + const long double diff = fabsl(ai-bi); + if (diff > out) out = diff; + } + return out; +} + +static long double +norminf_diff_scaled_fd(const int m, const float *a, const double *b) +{ + int i; + long double out = 0.0; + for (i = 0; i < m; ++i) { + const long double ai = a[i]; + const long double bi = b[i]; + const long double ci = fabsl(bi); + const long double diff = fabsl(ai-bi); + + if (ci != 0.0) { + const long double rat = diff / ci; + if (rat > out) out = rat; + } + else if (diff != 0.0) + out = HUGE_VAL; + } + return out; +} + +static float +norm2f(const int m, const float *x) +{ + int i; + long double out = 0.0; + for (i = 0; i < m; ++i) { + const long double tmp = x[i]; + out += tmp*tmp; + } + return out; +} + +static double +norm2(const int m, const double *x) +{ + int i; + long double out = 0.0; + for (i = 0; i < m; ++i) { + const long double tmp = x[i]; + out += tmp*tmp; + } + return out; +} + +static double +minmag(const int m, const double *x) +{ + int i; + double out = HUGE_VAL; + for (i = 0; i < m; ++i) { + const double axi = fabs(x[i]); + if (axi < out) out = axi; + } + return out; +} + /* Runs iterative refinement and compares to the true solution. */ void @@ -17,9 +112,8 @@ slsq_itref_test(int m, int n, float *a, float *b, struct option_type *options) { char itref; - float *c, *r, *res, *y; + float *res; int mn = m * n, max_m_n, nrhs = 1, i; - float rcond; float *x, *af; int info; int verbose = dbg->debug_data[VERBOSE_I]; @@ -30,11 +124,8 @@ slsq_itref_test(int m, int n, float *a, float *b, enum blas_trans_type trans = options->trans; max_m_n = MAX(m, n); - c = MALLOC(float, n); - r = MALLOC(float, max_m_n); x = MALLOC(float, max_m_n); res = malloc(max_m_n * sizeof(float)); - y = malloc(max_m_n * sizeof(float)); af = malloc(mn * sizeof(float)); errbnd = calloc (20 * nrhs, sizeof(float)); @@ -70,7 +161,10 @@ slsq_itref_test(int m, int n, float *a, float *b, dbg->debug_data[X_BERR_I] = 1.0; dbg->debug_data[R_BERR_I] = 1.0; dbg->debug_data[FLAGS_I] = 0.0; - + + dbg->debug_data[NORM_B_I] = norminff(m, b); + dbg->debug_data[NORM2_B_I] = norm2f(m, b); + { struct timespec tv; @@ -79,8 +173,99 @@ slsq_itref_test(int m, int n, float *a, float *b, a, m, af, m, b, m, x, m, res, m, nparams, ¶ms[0], &errbnd[0]); dbg->debug_data[TOTAL_TIME_I] = toc(&tv); + + dbg->debug_data[XBND_I] = errbnd[0]; + dbg->debug_data[ZX_BND_I] = errbnd[1]; + dbg->debug_data[RBND_I] = errbnd[2]; + dbg->debug_data[ZR_BND_I] = errbnd[3]; + dbg->debug_data[INFO_I] = (float) info; + dbg->debug_data[X_BERR_I] = errbnd[4]; + dbg->debug_data[R_BERR_I] = errbnd[5]; + dbg->debug_data[XCOND_NORM_I] = errbnd[8]; + dbg->debug_data[XCOND_COMP_I] = errbnd[9]; + dbg->debug_data[RCOND_NORM_I] = errbnd[10]; + dbg->debug_data[RCOND_COMP_I] = errbnd[11]; + dbg->debug_data[NCONDR_CHEAP_I] = errbnd[12]; + dbg->debug_data[CCONDR_CHEAP_I] = errbnd[13]; + if (verbose > 0) { + printf(" norm_cond_r_cheap = %16.8e\n", errbnd[12]); + printf(" comp_cond_r_cheap = %16.8e\n", errbnd[13]); + } } - + + + /* Error in Computed X, norms of True and Computed X */ + dbg->debug_data[NORM_XTRUE_I] = norminf(n, x_truth); + dbg->debug_data[NORM2_XTRUE_I] = norm2(n, x_truth); + dbg->debug_data[NORM_X_I] = norminff(n, x); + dbg->debug_data[NORM2_X_I] = norm2f(n, x); + dbg->debug_data[XERR_I] = norminf_diff_fd(n, x, x_truth) + / dbg->debug_data[NORM_XTRUE_I]; + dbg->debug_data[ZX_ERR_I] = norminf_diff_scaled_fd(n, x, x_truth); + + dbg->debug_data[XTMIN_I] = minmag(n, x_truth); + + /* Error in Computed R, norms of True and Computed R */ + dbg->debug_data[NORM_RTRUE_I] = norminf(m, r_truth); + dbg->debug_data[NORM2_RTRUE_I] = norm2(m, r_truth); + dbg->debug_data[NORM_R_I] = norminff(m, res); + dbg->debug_data[NORM2_R_I] = norm2f(m, res); + dbg->debug_data[RERR_I] = norminf_diff_fd(m, res, r_truth) + / dbg->debug_data[NORM_RTRUE_I]; + dbg->debug_data[RERRB_I] = norminf_diff_fd(m, res, r_truth) + / dbg->debug_data[NORM_B_I]; + dbg->debug_data[ZR_ERR_I] = norminf_diff_scaled_fd(n, res, r_truth); + + dbg->debug_data[RTMIN_I] = minmag(m, r_truth); + + { /* Computed X and R: norm(A*X,2)**2, R'*(A*X), B'*(A*X) */ + long double norm2Ax = 0.0, rTAx = 0.0, bTAx = 0.0; + double *Ax; + Ax = malloc(m * sizeof(double)); + BLAS_dgemv_s_s_x(blas_colmajor, blas_no_trans, m, n, 1.0, a, m, x, 1, + 0.0, Ax, 1, blas_prec_extra); + for (i = 0; i < m; ++i) { + const long double tmp = fabs(Ax[i]); + norm2Ax += tmp*tmp; + rTAx += Ax[i]*(long double)res[i]; + bTAx += Ax[i]*(long double)b[i]; + } + dbg->debug_data[NORM2_AX_I] = norm2Ax; + dbg->debug_data[RT_AX_I] = rTAx; + dbg->debug_data[BT_AX_I] = bTAx; + free(Ax); + } + + {/* True X and R: norm(A*X,2)**2, R'*(A*X), B'*(A*X) */ + long double norm2Ax = 0.0, rTAx = 0.0, bTAx = 0.0; + double *Ax; + Ax = malloc(m * sizeof(double)); + BLAS_dgemv_s_d_x(blas_colmajor, blas_no_trans, m, n, 1.0, a, m, + x_truth, 1, 0.0, Ax, 1, blas_prec_extra); + for (i = 0; i < m; ++i) { + const long double tmp = fabs(Ax[i]); + norm2Ax += tmp*tmp; + rTAx += Ax[i]*(long double)r_truth[i]; + bTAx += Ax[i]*(long double)b[i]; + } + dbg->debug_data[NORM2_AXTRUE_I] = norm2Ax; + dbg->debug_data[RTRUET_AXTRUE_I] = rTAx; + dbg->debug_data[BT_AXTRUE_I] = bTAx; + free(Ax); + } + + { /* norm(A'*R, inf), true and computed R */ + double *Atr; + Atr = malloc(n*sizeof(double)); + BLAS_dgemv_s_s_x(blas_colmajor, blas_trans, m, n, 1.0, a, m, + res, 1.0, 0.0, Atr, 1, blas_prec_extra); + dbg->debug_data[NORM_ATR_I] = norm2(n, Atr); + BLAS_dgemv_s_d_x(blas_colmajor, blas_trans, m, n, 1.0, a, m, + r_truth, 1.0, 0.0, Atr, 1, blas_prec_extra); + dbg->debug_data[NORM_ATRTRUE_I] = norm2(n, Atr); + free(Atr); + } + if ( verbose > 0 ) { if (info) printf(" SLSGE_X unsuccessful: %d\n", info); @@ -93,129 +278,7 @@ slsq_itref_test(int m, int n, float *a, float *b, /*sprint_vector(c, n, "c"); sprint_vector(r, n, "r");*/ } - - { - float axi, ymin = HUGE_VAL, xmin = HUGE_VAL; - float maxres, maxeqres, ymax = -0.0, normb, tmp; - double x_top = 0.0, r_top = 0.0; - double y_top = 0.0; - double xcmp_max = 0.0, rcmp_max = 0.0; - double xcmp_min = 1.0e37; - double err, cmp_err; - double yt, ytmin = HUGE_VAL, ytmax = 0.0; - int max_i = 0; - - for (i = 0; i < n; i++) { - yt = fabs(x_truth[i]); - if (yt > ytmax) ytmax = yt; - if (yt < ytmin) ytmin = yt; - - y[i] = x[i]; - axi = fabs(x[i]); - if (axi < xmin) xmin = axi; - axi = fabs(y[i]); - if (axi < ymin) ymin = axi; - if (axi > ymax) ymax = axi; - err = fabs(x_truth[i] - x[i]); - cmp_err = err / fabs(x_truth[i]); -#if 0 - printf("XT[%d] = %20.15g\tX[%d] = %20.15g %s\n", - i,x_truth[i],i,x[i], (cmp_err > 1.0e-4? "(*)":"")); -#endif - if (cmp_err > xcmp_max) { - xcmp_max = cmp_err; - max_i = i; - } else if (cmp_err < xcmp_min) { - xcmp_min = cmp_err; - } - x_top = MAX(x_top, err); - if (err > y_top) y_top = err; - } - dbg->debug_data[XMIN_I] = xmin; - dbg->debug_data[YMIN_I] = ymin; - dbg->debug_data[Y_I] = ymax; - dbg->debug_data[ZX_ERR_I] = xcmp_max; - if (dbg->debug_data[XT_I] != 0.0) - dbg->debug_data[XERR_I] = x_top / dbg->debug_data[XT_I]; - else if (x_top == 0.0) - dbg->debug_data[XERR_I] = 0.0; - else - dbg->debug_data[XERR_I] = HUGE_VAL; - - if (ymax != 0.0) - dbg->debug_data[YERR_I] = y_top / ymax; - else if (y_top == 0.0) - dbg->debug_data[YERR_I] = 0.0; - else - dbg->debug_data[YERR_I] = HUGE_VAL; - - dbg->debug_data[XBND_I] = errbnd[0]; - dbg->debug_data[ZX_BND_I] = errbnd[1]; - dbg->debug_data[RBND_I] = errbnd[2]; - dbg->debug_data[ZR_BND_I] = errbnd[3]; - dbg->debug_data[INFO_I] = (float) info; - dbg->debug_data[X_BERR_I] = errbnd[4]; - dbg->debug_data[R_BERR_I] = errbnd[5]; - dbg->debug_data[XCOND_NORM] = errbnd[8]; - dbg->debug_data[XCOND_COMP] = errbnd[9]; - dbg->debug_data[RCOND_NORM] = errbnd[10]; - dbg->debug_data[RCOND_COMP] = errbnd[11]; -#if 0 - printf(" norm_cond_r_cheap = %16.8e\n", errbnd[12]); - printf(" comp_cond_r_cheap = %16.8e\n", errbnd[13]); -#endif - dbg->debug_data[WORST_X_I] = fabs(x[max_i]); - dbg->debug_data[WORST_C_I] = c[max_i]; - - if (ytmin != 0.0) - dbg->debug_data[YTCOND_I] = ytmax / ytmin; - else if (ytmax == 0.0) - dbg->debug_data[YTCOND_I] = 0.0; - else - dbg->debug_data[YTCOND_I] = HUGE_VAL; - - /* Compute residual */ -#if 0 - memcpy(res, b, m*sizeof(float)); - BLAS_sgemm_x(blas_colmajor, blas_no_trans, blas_no_trans, - m, 1, n, -1.0, a, m, x, m, 1.0, res, m, blas_prec_extra); - if (verbose > 0) sprint_vector(res, m, "r = b - A*x"); -#else - if (verbose > 0) sprint_vector(res, m, "r_from_itref"); -#endif - - normb = maxres = maxeqres = -0.0; - for (i = 0; i < m; ++i) { - tmp = fabs( res[i] ); - if (tmp > maxeqres) maxeqres = tmp; - if (tmp > maxres) maxres = tmp; - tmp = fabs( b[i] ); - if (tmp > normb) normb = tmp; - err = fabs(r_truth[i] - res[i]); - r_top = MAX(r_top, err); - cmp_err = err / fabs(r_truth[i]); - if ( cmp_err > rcmp_max ) rcmp_max = cmp_err; - } -#if 1 - dbg->debug_data[RERR_I] = (float)r_top / normb; -#else /* XSL */ - dbg->debug_data[RERR_I] = (float)r_top / maxres; -#endif - dbg->debug_data[ZR_ERR_I] = rcmp_max; -#if 0 - if ( dbg->debug_data[RERR_I] > 1000.* ferr_r ) { - printf("R true err = %e\nR err bound = %e\n", - dbg->debug_data[RERR_I], ferr_r); - } -#endif - dbg->debug_data[RES_I] = (float)maxres; - dbg->debug_data[EQRES_I] = (float)maxeqres; - } - free(res); - free(y); - free(c); - free(r); free(x); free(af); free(errbnd);