From 3edb25d1dc75c18c299b372c451d12fe13a9fb67 Mon Sep 17 00:00:00 2001 From: Massimiliano Fasi Date: Thu, 30 May 2024 15:38:26 +0100 Subject: [PATCH] Add support for saturation mode --- mex/cpfloat.c | 68 ++++++++++--- mex/cpfloat.m | 16 ++- src/cpfloat_definitions.h | 39 +++++-- src/cpfloat_template.h | 33 +++--- test/cpfloat_test.m | 208 +++++++++++++++++++++++++++----------- test/cpfloat_test.ts | 7 +- 6 files changed, 264 insertions(+), 107 deletions(-) diff --git a/mex/cpfloat.c b/mex/cpfloat.c index 85eec0c..1320c8a 100644 --- a/mex/cpfloat.c +++ b/mex/cpfloat.c @@ -41,11 +41,14 @@ void mexFunction(int nlhs, fpopts->precision = 11; fpopts->emin = -14; fpopts->emax = 15; - fpopts->subnormal = CPFLOAT_SUBN_USE; fpopts->explim = CPFLOAT_EXPRANGE_TARG; fpopts->round = CPFLOAT_RND_NE; + fpopts->subnormal = CPFLOAT_SAT_NO; + fpopts->subnormal = CPFLOAT_SUBN_USE; + fpopts->flip = CPFLOAT_SOFTERR_NO; fpopts->p = 0.5; + fpopts->bitseed = NULL; fpopts->randseedf = NULL; fpopts->randseed = NULL; @@ -54,6 +57,7 @@ void mexFunction(int nlhs, /* Parse second argument and populate fpopts structure. */ if (nrhs > 1) { bool is_subn_rnd_default = false; + bool is_saturation_default = false; if(!mxIsEmpty(prhs[1]) && !mxIsStruct(prhs[1])) { mexErrMsgIdAndTxt("cpfloat:invalidstruct", "Second argument must be a struct."); @@ -62,7 +66,7 @@ void mexFunction(int nlhs, if (tmp != NULL) { if (mxGetM(tmp) == 0 && mxGetN(tmp) == 0) - /* Use default format, for compatibility with chop. */ + /* Set default format, for compatibility with chop. */ strcpy(fpopts->format, "h"); else if (mxGetClassID(tmp) == mxCHAR_CLASS) strcpy(fpopts->format, mxArrayToString(tmp)); @@ -80,6 +84,7 @@ void mexFunction(int nlhs, fpopts->precision = 4; fpopts->emin = -6; fpopts->emax = 8; + is_saturation_default = true; } else if (!strcmp(fpopts->format, "q52") || !strcmp(fpopts->format, "fp8-e5m2") || !strcmp(fpopts->format, "E5M2")) { @@ -161,6 +166,31 @@ void mexFunction(int nlhs, else if (mxGetClassID(tmp) == mxDOUBLE_CLASS) fpopts->round = *((double *)mxGetData(tmp)); } + tmp = mxGetField(prhs[1], 0, "saturation"); + if (tmp != NULL) { + if (mxGetM(tmp) == 0 && mxGetN(tmp) == 0) + fpopts->saturation = CPFLOAT_SAT_NO; + else if (mxGetClassID(tmp) == mxDOUBLE_CLASS) + fpopts->saturation = *((double *)mxGetData(tmp)); + } else { + if (is_saturation_default) + fpopts->saturation = CPFLOAT_SAT_USE; /* Default for E4M3. */ + else + fpopts->saturation = CPFLOAT_SAT_NO; + } + tmp = mxGetField(prhs[1], 0, "subnormal"); + if (tmp != NULL) { + if (mxGetM(tmp) == 0 && mxGetN(tmp) == 0) + fpopts->subnormal = CPFLOAT_SUBN_USE; + else if (mxGetClassID(tmp) == mxDOUBLE_CLASS) + fpopts->subnormal = *((double *)mxGetData(tmp)); + } else { + if (is_subn_rnd_default) + fpopts->subnormal = CPFLOAT_SUBN_RND; /* Default for bfloat16. */ + else + fpopts->subnormal = CPFLOAT_SUBN_USE; + } + tmp = mxGetField(prhs[1], 0, "flip"); if (tmp != NULL) { if (mxGetM(tmp) == 0 && mxGetN(tmp) == 0) @@ -288,10 +318,11 @@ void mexFunction(int nlhs, /* Allocate and return second output. */ if (nlhs > 1) { - const char* field_names[] = {"format", "params", "subnormal", "round", - "flip", "p", "explim"}; + const char* field_names[] = {"format", "params", "explim", + "round", "saturation", "subnormal", + "flip", "p"}; mwSize dims[2] = {1, 1}; - plhs[1] = mxCreateStructArray(2, dims, 7, field_names); + plhs[1] = mxCreateStructArray(2, dims, 8, field_names); mxSetFieldByNumber(plhs[1], 0, 0, mxCreateString(fpopts->format)); mxArray *outparams = mxCreateDoubleMatrix(1,3,mxREAL); @@ -301,30 +332,35 @@ void mexFunction(int nlhs, outparamsptr[2] = fpopts->emax; mxSetFieldByNumber(plhs[1], 0, 1, outparams); - mxArray *outsubnormal = mxCreateDoubleMatrix(1,1,mxREAL); - double *outsubnormalptr = mxGetData(outsubnormal); - outsubnormalptr[0] = fpopts->subnormal; - mxSetFieldByNumber(plhs[1], 0, 2, outsubnormal); + mxArray *outexplim = mxCreateDoubleMatrix(1, 1, mxREAL); + double *outexplimptr = mxGetData(outexplim); + outexplimptr[0] = fpopts->explim; + mxSetFieldByNumber(plhs[1], 0, 2, outexplim); mxArray *outround = mxCreateDoubleMatrix(1,1,mxREAL); double *outroundptr = mxGetData(outround); outroundptr[0] = fpopts->round; mxSetFieldByNumber(plhs[1], 0, 3, outround); + mxArray *outsaturation = mxCreateDoubleMatrix(1,1,mxREAL); + double *outsaturationptr = mxGetData(outsaturation); + outsaturationptr[0] = fpopts->saturation; + mxSetFieldByNumber(plhs[1], 0, 4, outsaturation); + + mxArray *outsubnormal = mxCreateDoubleMatrix(1,1,mxREAL); + double *outsubnormalptr = mxGetData(outsubnormal); + outsubnormalptr[0] = fpopts->subnormal; + mxSetFieldByNumber(plhs[1], 0, 5, outsubnormal); + mxArray *outflip = mxCreateDoubleMatrix(1,1,mxREAL); double *outflipptr = mxGetData(outflip); outflipptr[0] = fpopts->flip; - mxSetFieldByNumber(plhs[1], 0, 4, outflip); + mxSetFieldByNumber(plhs[1], 0, 6, outflip); mxArray *outp = mxCreateDoubleMatrix(1,1,mxREAL); double *outpptr = mxGetData(outp); outpptr[0] = fpopts->p; - mxSetFieldByNumber(plhs[1], 0, 5, outp); - - mxArray *outexplim = mxCreateDoubleMatrix(1,1,mxREAL); - double *outexplimptr = mxGetData(outexplim); - outexplimptr[0] = fpopts->explim; - mxSetFieldByNumber(plhs[1], 0, 6, outexplim); + mxSetFieldByNumber(plhs[1], 0, 7, outp); } if (nlhs > 2) diff --git a/mex/cpfloat.m b/mex/cpfloat.m index 4f1dfa3..4ea0196 100644 --- a/mex/cpfloat.m +++ b/mex/cpfloat.m @@ -39,11 +39,6 @@ % the target format, respectively. The default value of this field is % the vector [11,-14,15]. % -% * The scalar FPOPTS.subnormal specifies the support for subnormal numbers. -% The target floating-point format will not support subnormal numbers if -% this field is set to 0, and will support them otherwise. The default value -% for this field is 0 if the target format is 'bfloat16' and 1 otherwise. -% % * The scalar FPOPTS.explim specifies the support for an extended exponent % range. The target floating-point format will have the exponent range of % the storage format ('single' or 'double', depending on the class of X) if @@ -63,6 +58,17 @@ % Any other value results in no rounding. The default value for this field % is 1. % +% * The scalar FPOPTS.saturation specifies whether saturation arithmetic is in +% use. On overflow, the target floating-point format will use the largest +% representable floating-point if this field is set to 0, and infinity +% otherwise. The default value for this field is 1 if the target format is +% 'E4M3' and 1 otherwise. + +% * The scalar FPOPTS.subnormal specifies the support for subnormal numbers. +% The target floating-point format will not support subnormal numbers if +% this field is set to 0, and will support them otherwise. The default value +% for this field is 0 if the target format is 'bfloat16' and 1 otherwise. +% % * The scalar FPOPTS.flip specifies whether the function should simulate the % occurrence of a single bit flip striking the floating-point representation % of elements of Y. Possible values are: diff --git a/src/cpfloat_definitions.h b/src/cpfloat_definitions.h index 009963d..3c049ff 100644 --- a/src/cpfloat_definitions.h +++ b/src/cpfloat_definitions.h @@ -10,6 +10,7 @@ * * + @ref cpfloat_explim_t, * + @ref cpfloat_rounding_t, + * + @ref cpfloat_saturation_t, * + @ref cpfloat_softerr_t, * + @ref cpfloat_subnormal_t, * @@ -88,6 +89,16 @@ typedef enum { CPFLOAT_NO_RND = 8, } cpfloat_rounding_t; +/** + * @brief Saturation modes available in CPFloat. + */ +typedef enum { + /** Use standard arithmetic. */ + CPFLOAT_SAT_NO = 0, + /** Use saturation arithmetic. */ + CPFLOAT_SAT_USE = 1, +} cpfloat_saturation_t; + /** * @brief Soft fault simulation modes available in CPFloat. */ @@ -214,14 +225,6 @@ typedef struct { * exponent is larger than the maximum allowed by the storage format. */ cpfloat_exponent_t emax; - /** - * @brief Support for subnormal numbers in target format. - * - * @details Subnormal numbers are supported if this field is set to - * `CPFLOAT_SUBN_USE` and rounded to a normal number using the current - * rounding mode if it is set to `CPFLOAT_SUBN_RND`. - */ - cpfloat_subnormal_t subnormal; /** * @brief Support for extended exponents in target format. * @@ -256,6 +259,24 @@ typedef struct { * those in the list above is specified. */ cpfloat_rounding_t round; + /** + * @brief Support for subnormal numbers in target format. + * + * @details Subnormal numbers are supported if this field is set to + * `CPFLOAT_SUBN_USE` and rounded to a normal number using the current + * rounding mode if it is set to `CPFLOAT_SUBN_RND`. + */ + cpfloat_saturation_t saturation; + /** + * @brief Support for subnormal numbers in target format. + * + * @details Subnormal numbers are supported if this field is set to + * `CPFLOAT_SUBN_USE` and rounded to a normal number using the current + * rounding mode if it is set to `CPFLOAT_SUBN_RND`. + */ + cpfloat_subnormal_t subnormal; + + /* Bit flips. */ /** * @brief Support for soft errors. * @@ -281,6 +302,8 @@ typedef struct { * contain a number in the interval [0,1]. */ double p; + + /* Internal: state of pseudo-random number generator. */ /** * @brief Internal state of pseudo-random number generator for single bits. * diff --git a/src/cpfloat_template.h b/src/cpfloat_template.h index 426f445..8ba349d 100644 --- a/src/cpfloat_template.h +++ b/src/cpfloat_template.h @@ -163,6 +163,7 @@ typedef struct { cpfloat_subnormal_t subnormal; cpfloat_rounding_t round; FPTYPE ftzthreshold; + FPTYPE ofvalue; FPTYPE xmin; FPTYPE xmax; FPTYPE xbnd; @@ -317,15 +318,17 @@ static inline FPPARAMS COMPUTE_GLOBAL_PARAMS(const optstruct *fpopts, FPTYPE xmin = ldexp(1., emin); /* Smallest pos. normal. */ FPTYPE xmins = ldexp(1., emin-precision+1); /* Smallest pos. subnormal. */ FPTYPE ftzthreshold = (fpopts->subnormal == CPFLOAT_SUBN_USE) ? xmins : xmin; + FPTYPE xmax = ldexp(1., emax) * (2-ldexp(1., 1-precision)); FPTYPE xbnd = ldexp(1., emax) * (2-ldexp(1., -precision)); + FPTYPE ofvalue = (fpopts->saturation == CPFLOAT_SAT_USE) ? xmax : INFINITY; /* Bitmasks. */ INTTYPE leadmask = FULLMASK << (DEFPREC-precision); /* To keep. */ INTTYPE trailmask = leadmask ^ FULLMASK; /* To discard. */ FPPARAMS params = {precision, emax, emin, fpopts->subnormal, fpopts->round, - ftzthreshold, xmin, xmax, xbnd, + ftzthreshold, ofvalue, xmin, xmax, xbnd, leadmask, trailmask, NULL, NULL}; return params; @@ -401,7 +404,7 @@ static inline void UPDATE_LOCAL_PARAMS(const FPTYPE *A, else \ *(x) = FPOF(SIGN(y) | INTOFCONST(p->ftzthreshold)); \ } else if (ABS(y) >= p->xbnd) { /* Overflow */ \ - *(x) = FPOF(SIGN(y) | INTOFCONST(INFINITY)); \ + *(x) = FPOF(SIGN(y) | INTOFCONST(p->ofvalue)); \ } else { \ UPDATE_LOCAL_PARAMS(y, p, lp); \ *(x) = FPOF((INTOF(y) + \ @@ -425,7 +428,7 @@ static inline void UPDATE_LOCAL_PARAMS(const FPTYPE *A, else \ *(x) = FPOF(SIGN(y)); \ } else if (ABS(y) > p->xbnd) { /* Overflow */ \ - *(x) = FPOF(SIGN(y) | INTOFCONST(INFINITY)); \ + *(x) = FPOF(SIGN(y) | INTOFCONST(p->ofvalue)); \ } else { \ UPDATE_LOCAL_PARAMS(y, p, lp); \ *(x) = FPOF((INTOF(y) + (lp->trailmask>>1)) & lp->leadmask); \ @@ -450,7 +453,7 @@ static inline void UPDATE_LOCAL_PARAMS(const FPTYPE *A, else \ *(x) = FPOF(SIGN(y) | INTOFCONST(p->ftzthreshold)); \ } else if (ABS(y) >= p->xbnd) { /* Overflow */ \ - *(x) = FPOF(SIGN(y) | INTOFCONST(INFINITY)); \ + *(x) = FPOF(SIGN(y) | INTOFCONST(p->ofvalue)); \ } else { \ UPDATE_LOCAL_PARAMS(y, p, lp); \ INTTYPE LSB = ((INTOF(y) >> (DEFPREC-lp->precision)) & INTCONST(1)) \ @@ -477,11 +480,11 @@ static inline void UPDATE_LOCAL_PARAMS(const FPTYPE *A, *(x) = *(y) > 0 ? p->ftzthreshold : 0; \ } else if (ABS(y) > p->xmax) { /* Overflow */ \ if (*(y) > p->xmax) \ - *(x) = INFINITY; \ - else if (*(y) < -p->xmax && *(y) != -INFINITY) \ + *(x) = p->ofvalue; \ + else if (*(y) < -p->xmax && *(y) != -p->ofvalue) \ *(x) = -p->xmax; \ else /* *(y) == -INFINITY */ \ - *(x) = -INFINITY; \ + *(x) = -p->ofvalue; \ } else { \ UPDATE_LOCAL_PARAMS(y, p, lp); \ if (SIGN(y) == 0) /* Add ulp if x is positive. */ \ @@ -509,11 +512,11 @@ static inline void UPDATE_LOCAL_PARAMS(const FPTYPE *A, *(x) = *(y) >= 0 ? 0 : -p->ftzthreshold; \ } else if (ABS(y) > p->xmax) { /* Overflow */ \ if (*(y) < -p->xmax) \ - *(x) = -INFINITY; \ - else if (*(y) > p->xmax && *(y) != INFINITY) \ + *(x) = -p->ofvalue; \ + else if (*(y) > p->xmax && *(y) != p->ofvalue) \ *(x) = p->xmax; \ else /* *(y) == INFINITY */ \ - *(x) = INFINITY; \ + *(x) = p->ofvalue; \ } else { \ UPDATE_LOCAL_PARAMS(y, p, lp); \ if (SIGN(y)) /* Subtract ulp if x is positive. */ \ @@ -532,7 +535,7 @@ static inline void UPDATE_LOCAL_PARAMS(const FPTYPE *A, #define RD_TWD_ZERO_SCALAR_OTHER_EXP(x, y, p, lp) \ if (ABS(y) < p->ftzthreshold) { /* Underflow */ \ *(x) = FPOF(SIGN(y)); \ - } else if (ABS(y) > p->xmax && ABS(y) != INFINITY) { /* Overflow */ \ + } else if (ABS(y) > p->xmax && ABS(y) != p->ofvalue) { /* Overflow */ \ *(x) = FPOF(SIGN(y) | INTOFCONST(p->xmax)); \ } else { \ UPDATE_LOCAL_PARAMS(y, p, lp); \ @@ -581,7 +584,7 @@ static inline void UPDATE_LOCAL_PARAMS(const FPTYPE *A, *(x) = *(y); \ } \ if (ABS(x) >= p->xbnd) /* Overflow */ \ - *(x) = FPOF(SIGN(y) | INTOFCONST(INFINITY)); + *(x) = FPOF(SIGN(y) | INTOFCONST(p->ofvalue)); /* Stochastic rounding with equal probabilities. */ #define RS_EQUI_SCALAR(x, y, p, lp) \ @@ -589,9 +592,9 @@ static inline void UPDATE_LOCAL_PARAMS(const FPTYPE *A, if (ABS(y) < p->ftzthreshold && *(y) != 0) { /* Underflow */ \ randombit = GENBIT(p->BITSEED); \ *(x) = FPOF(SIGN(y) | INTOFCONST(randombit ? p->ftzthreshold : 0)); \ - } else if (ABS(y) > p->xmax && ABS(y) != INFINITY) { /* Overflow */ \ + } else if (ABS(y) > p->xmax && ABS(y) != p->ofvalue) { /* Overflow */ \ randombit = GENBIT(p->BITSEED); \ - *(x) = FPOF(SIGN(y) | INTOFCONST(randombit ? INFINITY : p->xmax)); \ + *(x) = FPOF(SIGN(y) | INTOFCONST(randombit ? p->ofvalue : p->xmax)); \ } else if ((INTOF(y) & lp->trailmask)) { /* Not exactly representable. */ \ randombit = GENBIT(p->BITSEED); \ *(x) = FPOF(INTOF(y) & lp->leadmask); \ @@ -604,7 +607,7 @@ static inline void UPDATE_LOCAL_PARAMS(const FPTYPE *A, #define RO_SCALAR(x, y, p, lp) \ if (ABS(y) < p->ftzthreshold && *(y) != 0) { /* Underflow */ \ *(x) = FPOF(SIGN(y) | INTOFCONST(p->ftzthreshold)); \ - } else if (ABS(y) > p->xmax && ABS(y) != INFINITY) { /* Overflow */ \ + } else if (ABS(y) > p->xmax && ABS(y) != p->ofvalue) { /* Overflow */ \ *(x) = FPOF(SIGN(y) | INTOFCONST(p->xmax)); \ } else { \ UPDATE_LOCAL_PARAMS(y, p, lp); \ diff --git a/test/cpfloat_test.m b/test/cpfloat_test.m index abef1f8..b5acdc0 100644 --- a/test/cpfloat_test.m +++ b/test/cpfloat_test.m @@ -20,46 +20,69 @@ pi_h = 6432*uh; % fp16(pi) % Check handling of defaults and persistent variable. - fp.format = 'bfloat16'; [c,options] = cpfloat(pi,fp); + fp.format = 'bfloat16'; + [~,options] = cpfloat(pi,fp); assert_eq(fp.format,options.format) + options.subnormal assert_eq(options.subnormal,0) - fp.format = []; [c,options] = cpfloat(pi,fp); + fp.format = []; + [~,options] = cpfloat(pi,fp); assert_eq(options.format,'h') % Check default; - fp.subnormal = 0; [c,options] = cpfloat(pi,fp); - assert_eq(options.subnormal,0) + fp.explim = []; + [~,options] = cpfloat(pi,fp); + assert_eq(options.explim,1) % Check default. - fp.subnormal = []; [c,options] = cpfloat(pi,fp); - assert_eq(options.subnormal,1) % Check default; + fp.explim = 0; + [~,options] = cpfloat(pi,fp); + assert_eq(options.explim,0) % Check no default. - fp.round = []; [c,options] = cpfloat(pi,fp); + fp.round = []; + [~,options] = cpfloat(pi,fp); assert_eq(options.round,1) % Check default. - fp.flip = []; [c,options] = cpfloat(pi,fp); - assert_eq(options.flip,0) % Check no default. + fp.saturation = 1; + [~,options] = cpfloat(pi,fp); + assert_eq(options.saturation,1) - fp.explim = []; [c,options] = cpfloat(pi,fp); - assert_eq(options.explim,1) % Check default. + fp.saturation = []; + [~,options] = cpfloat(pi,fp); + assert_eq(options.saturation,0) % Check default; - fp.explim = 0; [c,options] = cpfloat(pi,fp); - assert_eq(options.explim,0) % Check no default. + fp.subnormal = 0; + [~,options] = cpfloat(pi,fp); + assert_eq(options.subnormal,0) + + fp.subnormal = []; + [~,options] = cpfloat(pi,fp); + assert_eq(options.subnormal,1) % Check default; + + fp.flip = []; + [~,options] = cpfloat(pi,fp); + assert_eq(options.flip,0) % Check no default. clear cpfloat fp options - fp.flip = 1; [~,options] = cpfloat([],fp); + fp.flip = 1; + [~,options] = cpfloat([],fp); assert_eq(options.format,'h') assert_eq(options.round,1) + assert_eq(options.saturation,0) assert_eq(options.subnormal,1) clear cpfloat fp options % check all default options - fp.format = []; fp.subnormal = []; - fp.round = []; fp.flip = []; + fp.format = []; + fp.round = []; + fp.saturation = []; + fp.subnormal = []; + fp.flip = []; fp.p = []; - [c,options] = cpfloat(pi,fp); + [~,options] = cpfloat(pi,fp); assert_eq(options.format,'h') - assert_eq(options.subnormal,1) assert_eq(options.round,1) + assert_eq(options.saturation,0) + assert_eq(options.subnormal,1) assert_eq(options.flip,0) assert_eq(options.p,0.5) % % Takes different path from previous test since fpopts exists. @@ -71,18 +94,22 @@ clear cpfloat fp fp.flip = 1; fp.format = 'd'; c = ones(8,1); - d = cpfloat(c,fp); assert_eq(norm(d-c,1)>0,true); - d = cpfloat(c',fp); assert_eq(norm(d-c',1)>0,true); + d = cpfloat(c,fp); + assert_eq(norm(d-c,1)>0,true); + d = cpfloat(c',fp); + assert_eq(norm(d-c',1)>0,true); fp.p = 0; % No bits flipped. - d = cpfloat(c,fp); assert_eq(d,d); + d = cpfloat(c,fp); + assert_eq(d,d); fp.p = 1; % All bits flipped. - d = cpfloat(c,fp); assert_eq(all(d ~= c),true); + d = cpfloat(c,fp); + assert_eq(all(d ~= c),true); clear cpfloat [~,fp] = cpfloat; assert_eq(fp.subnormal,1) assert_eq(fp.format,'h') - [c,options] = cpfloat(pi); + [~,options] = cpfloat(pi); assert_eq(options.format,'h') assert_eq(options.subnormal,1) assert_eq(options.round,1) @@ -91,7 +118,7 @@ clear fp fp.format = 'd'; - [c,options] = cpfloat(pi,fp); + [~,options] = cpfloat(pi,fp); assert_eq(options.format,'d') assert_eq(options.subnormal,1) assert_eq(options.params, [53 -1022 1023]) @@ -101,7 +128,19 @@ assert_eq(fp.params, [53 -1022 1023]) clear fp - fp.format = 'bfloat16'; [c,options] = cpfloat(pi,fp); + fp.format = 'E4M3'; + [~,options] = cpfloat(pi,fp); + assert_eq(options.format,'E4M3') + assert_eq(options.saturation,1) + assert_eq(options.params, [4 -6 8]) + [~,fp] = cpfloat; + assert_eq(fp.format,'E4M3') + assert_eq(fp.saturation,1) + assert_eq(fp.params, [4 -6 8]) + + clear fp + fp.format = 'bfloat16'; + [~,options] = cpfloat(pi,fp); assert_eq(options.format,'bfloat16') assert_eq(options.subnormal,0) assert_eq(options.params, [8 -126 127]) @@ -114,7 +153,8 @@ [~,fp] = cpfloat; fp.format = 'b'; fp = rmfield(fp, 'params'); - [c,options] = cpfloat(pi,fp); + [~,options] = cpfloat(pi,fp); + assert_eq(options.saturation,0) % No saturation if that field was empty. assert_eq(options.subnormal,1) % No subnormals only if that field was empty. % Check these usages do not give an error. @@ -134,7 +174,8 @@ B = A + randn(size(A))*1e-12; C = cpfloat(B,options); assert_eq(A,C); - A2 = hilb(6); C = cpfloat(A2); + A2 = hilb(6); + C = cpfloat(A2); options.format = 'c'; options.params = [8 -126 127]; % bfloat16 @@ -152,7 +193,7 @@ [X1,opt] = cpfloat(A,options); [X2,opt2] = cpfloat(A,options2); assert_eq(X1,X2) - % assert_eq(cpfloat(A,options),cpfloat(A,options2)); + assert_eq(cpfloat(A,opt),cpfloat(A,opt2)); % Row vector clear options @@ -185,14 +226,14 @@ elseif i == 2 % Half precision tests. [u,xmins,xmin,xmax,p,emins,emin,emax] = float_params('half'); - options.format = 'h' + options.format = 'h'; elseif i == 3 % Quarter precision tests. [u,xmins,xmin,xmax,p,emins,emin,emax] = float_params('q43'); options.format = 'E4M3'; % Modification for OCP compliant q43. emin = -6; % Previously thought to be 1-emax=-7. - emax = 8; % Previously thought to be 7 + emax = 8; % Previously thought to be 7 emins = emin + 1 - p; % Exponent of smallest subnormal number. xmins = 2^emins; xmin = 2^emin; @@ -282,7 +323,19 @@ assert_eq(c,x) end + % Saturation tests. + options.saturation = 1; + for j = 1:6 + options.round = j; + x = inf; + c = cpfloat(x,options); + assert_eq(c,xmax) + c = cpfloat(-x,options); + assert_eq(c,-xmax) + end + % Infinities tests. + options.saturation = 0; for j = 1:6 options.round = j; x = inf; @@ -345,14 +398,16 @@ end options.subnormal = 1; - c = cpfloat(xmin,options); assert_eq(c,xmin) + c = cpfloat(xmin,options); + assert_eq(c,xmin) x = [xmins xmin/2 xmin 0 xmax 2*xmax 1-delta/5 1+delta/4]; c = cpfloat(x,options); c_expected = [x(1:5) inf 1 1]; assert_eq(c,c_expected) options.subnormal = 0; - c = cpfloat(xmin,options); assert_eq(c,xmin) + c = cpfloat(xmin,options); + assert_eq(c,xmin) x = [xmins xmin/2 xmin 0 xmax 2*xmax 1-delta/5 1+delta/4]; c = cpfloat(x,options); c_expected = [0 0 x(3:5) inf 1 1]; @@ -409,12 +464,24 @@ % Do not limit exponent. options.explim = 0; - x = xmin/2; c = cpfloat(x,options); assert_eq(c,x) - x = -xmin/2; c = cpfloat(x,options); assert_eq(c,x) - x = xmax*2; c = cpfloat(x,options); assert_eq(c,x) - x = -xmax*2; c = cpfloat(x,options); assert_eq(c,x) - x = xmins/2; c = cpfloat(x,options); assert_eq(c,x) - x = -xmins/2; c = cpfloat(x,options); assert_eq(c,x) + x = xmin/2; + c = cpfloat(x,options); + assert_eq(c,x) + x = -xmin/2; + c = cpfloat(x,options); + assert_eq(c,x) + x = xmax*2; + c = cpfloat(x,options); + assert_eq(c,x) + x = -xmax*2; + c = cpfloat(x,options); + assert_eq(c,x) + x = xmins/2; + c = cpfloat(x,options); + assert_eq(c,x) + x = -xmins/2; + c = cpfloat(x,options); + assert_eq(c,x) A = [pi -pi; pi -pi]; C = cpfloat(A,options); options.explim = 1; @@ -498,21 +565,32 @@ % Test rounding with CHOPFAST versus native rounding. options.format = 's'; - m = 100; y = zeros(3,n); z = y; + m = 100; + y = zeros(3,n); + z = y; for i = 1:m x = randn; - options.round = 2; y(i,1) = cpfloat(x,options); - options.round = 3; y(i,2) = cpfloat(x,options); - options.round = 4; y(i,3) = cpfloat(x,options); + options.round = 2; + y(i,1) = cpfloat(x,options); + options.round = 3; + y(i,2) = cpfloat(x,options); + options.round = 4; + y(i,3) = cpfloat(x,options); if usingoctave - fesetround(inf); z(i,1) = single(x); - fesetround(-inf); z(i,2) = single(x); - fesetround(0); z(i,3) = single(x); + fesetround(inf); + z(i,1) = single(x); + fesetround(-inf); + z(i,2) = single(x); + fesetround(0); + z(i,3) = single(x); else % Use undocumented function to set rounding mode in MATLAB. - feature('setround',inf), z(i,1) = single(x); - feature('setround',-inf), z(i,2) = single(x); - feature('setround',0), z(i,3) = single(x); + feature('setround',inf); + z(i,1) = single(x); + feature('setround',-inf); + z(i,2) = single(x); + feature('setround',0); + z(i,3) = single(x); end end assert_eq(y,z) @@ -533,9 +611,15 @@ c = cpfloat(x,options); assert_eq(c,[0 0 x(3:4)]) - options.format = 'd'; options.subnormal = 0; cpfloat([],options); - a = cpfloat(pi); assert_eq(a,pi) - options.format = 'd'; options.subnormal = 1; cpfloat([],options); + options.format = 'd'; + options.subnormal = 0; + cpfloat([],options); + a = cpfloat(pi); + assert_eq(a,pi) + + options.format = 'd'; + options.subnormal = 1; + cpfloat([],options); a = cpfloat(pi); assert_eq(a,pi) x = pi^2; @@ -563,8 +647,10 @@ yd = cpfloat(pd); assert_eq(double(ys),yd) - options.format = 'h'; options.round = 2; - as = single(rand(n,1)); ad = double(as); + options.format = 'h'; + options.round = 2; + as = single(rand(n,1)); + ad = double(as); delta = single(rand(n,1)); cd = cpfloat(ad + 1e-5*double(delta),options); cs = cpfloat(as + 1e-5*delta,options); @@ -581,21 +667,21 @@ % Test base 2 logarithm options.format = 'h'; options.round = 4; - x = single(2^-3 * (sum(2.^(-[0:23])))); - assert_eq(cpfloat(x,options), single(2^-3 * (sum(2.^(-[0:10]))))) + x = single(2^-3 * (sum(2.^(-(0:23))))); + assert_eq(cpfloat(x,options), single(2^-3 * (sum(2.^(-(0:10)))))) - x = 2^-3 * (sum(2.^(-[0:52]))); - assert_eq(cpfloat(x,options), 2^-3 * (sum(2.^(-[0:10])))) + x = 2^-3 * (sum(2.^(-(0:52)))); + assert_eq(cpfloat(x,options), 2^-3 * (sum(2.^(-(0:10))))) options.format = 's'; - x = single(2^-3 * (sum(2.^(-[0:23])))); + x = single(2^-3 * (sum(2.^(-(0:23))))); assert_eq(cpfloat(x,options), x) - x = 2^-3 * (sum(2.^(-[0:52]))); - assert_eq(cpfloat(x,options), 2^-3 * (sum(2.^(-[0:23])))) + x = 2^-3 * (sum(2.^(-(0:52)))); + assert_eq(cpfloat(x,options), 2^-3 * (sum(2.^(-(0:23))))) options.format = 'd'; - x = 2^-3 * (sum(2.^(-[0:52]))); + x = 2^-3 * (sum(2.^(-(0:52)))); assert_eq(cpfloat(x,options), x) options.round = 1; diff --git a/test/cpfloat_test.ts b/test/cpfloat_test.ts index be83030..9665010 100644 --- a/test/cpfloat_test.ts +++ b/test/cpfloat_test.ts @@ -53,10 +53,13 @@ optstruct *fpopts; void fpopts_setup(void) { fpopts = malloc(sizeof(optstruct)); - fpopts->flip = 0; - fpopts->p = 0.5; fpopts->round = 1; fpopts->explim = CPFLOAT_EXPRANGE_STOR; + fpopts->saturation = CPFLOAT_SAT_NO; + + fpopts->flip = 0; + fpopts->p = 0.5; + fpopts->bitseed = NULL; fpopts->randseedf = NULL; fpopts->randseed = NULL;