# R functions for use with the q-exponential distribution of Tsallis # Copyright (c) 2007, Cosma Shalizi, cshalizi@cmu.edu or cosma.shalizi@gmail.com ##### This is free software; you can redistribute it and/or modify ##### it under the terms of the GNU General Public License as published by ##### the Free Software Foundation; either version 2 of the License, or ##### (at your option) any later version. ##### ##### This software is distributed in the hope that it will be useful, ##### but WITHOUT ANY WARRANTY; without even the implied warranty of ##### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ##### GNU General Public License for more details. ##### ##### You should have received a copy of the GNU General Public License ##### along with this software; if not, write to the Free Software ##### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # Please contact me at the above e-mail addresses for bug reports, questions, # etc. Please also contact me if you use this code in a scientific paper! # This is a special case of the generalized Pareto distribution (type II), but # arguably of independent interest. # The distribution is defined through the upper cumulative or complementary # distribution function, a.k.a. survival function, ### Pr(X>=x) = (1-(1-q)x/kappa)^(1/(1-q)) # It is convenient to introduce a re-parameterization ### shape = -1/(1-q) ### scale = shape*kappa # which makes the relationship to the Pareto clearer, and eases estimation. # Users can employ either parameterization, with shape/scale as the default, # but q/kappa over-riding them. (No warning is given if a user provides both # sets of parameters.) # For the derivation of the maximum likelihood estimators, please see the # manuscript "Maximum Likelihood Estimation for q-Exponential (Tsallis) # Distributions", which should accompany this file; if not it is available from # http://bactra.org/research/tsallis-MLE/, along with the latest version of # this code. The manuscript is also available from # http://arxiv.org/abs/math.ST/0701854 # Functions are divided into four parts: # functions for the distribution itself, conforming to the R standards; # functions to estimate the parameters; # functions which illustrate the numerical accuracy of the implementation; # functions to go from one parametrization to the other. # Functions in this file: # Distribution-related (per R standards): # dtsal Probability density # ptsal Cumulative probability # qtsal Quantiles # rtsal Random variate generation # Parameter Estimation: # tsal.loglike Log-likelihood calculation # tsal.fit Estimate parameters; wrapper for detailed # methods. Call this, not its subroutines # tsal.mle.direct Direct maximization of likelihood # tsal.mle.equation Find MLE by solving estimating equation; default # tsal.est.shape.from.scale MLE of shape parameter given scale parameter # tsal.est.scale.from.shape MLE of scale parameter given shape parameter # tsal.curvefit Find parameters by fitting a curve to the # empirical distribution function; avoid # tsal.bootstrap.errors Bootstrap standard errors, biases for MLE # tsal.fisher Fisher information matrix, for asymptotics # tsal.mean Calculate the expectation value # tsal.total.magnitude Total magnitude of a population (estimated) # Implementation Testing: # plot.tsal.quantile.transform Illustrates relative numerical inaccuracy in # ptsal and qtsal, which should be inverses # plot.tsal.LR.distribution Calculates the log likelihood ratio for # estimated vs. fixed true parameters, and plots # it against the theoretical asymptotic # distribution (chi^2 with 2 d.f.). # Censored Data: # dtsal.tail Probability density (tail-conditional) # ptsal.tail Cumulative probability (tail-conditional) # qtsal.tail Quantiles (tail-conditional) # rtsal.tail Random variate generation (from the tail) # Parameter Conversion: # tsal.shape.from.q Get shape parameter from q # tsal.scale.from.qk Get scale parameter from q, kappa # tsal.q.from.shape Get q from from shape parameter # tsal.kappa.from.ss Get kappa from shape, scale # tsal.ss.from.qk Get shape, scale from q, kappa (as pairs) # tsal.qk.from.ss Get q, kappa from shape, scale (as pairs # HISTORY: # Begun 28 December 2006 # Preliminary version (0.0) finished on 28 January 2007 # Updated version (0.1) finished on 30 January 2007 ##### Changes: ##### 1. added right-tail-conditional versions of d/p/q/r tsal for censored data ##### 2. ordinary d/p/q/r tsal, loglike now take xmin argument, defaulting to 0 ##### 3. MLEs now take xmin argument, defaulting to zero ##### 4. tsal.mle.equation checks whether initial bracket of root makes sense, ##### if not, passes job to tsal.mle.direct (slower but always sensible) ##### 5. tsal.fit records method used ##### 6. tsal.bootstrap.errors takes method argument ##### 7. tsal.bootstrap.errors takes xmin argument ##### 8. added tsal.fisher to calculate Fisher information matrix ##### 9. tsal.fit added "leastsquares" method as alias for "curvefit" # Last modified 30 January 2007 # Updated version (0.2) finished on 5 February 2007 ##### Changes: ##### 1. Lifted constraints in fitting functions that shape and scale be > 0, ##### as negative values correspond to q < 1, and are perfectly OK ##### 2. tsal.mle.equation now uses two brackets, one for positive and one ##### for negative scale; tsal.mle.direct still called if neither works ##### 3. tsal.mle.direct now uses two initial conditions, one for q > 1 and one ##### for q < 1, goes with the higher likelihood ##### 4. dtsal and dtsal.tail now only evaluate log(shape/scale), rather than ##### log(shape) - log(scale) ##### 5. Added tsal.mean to calculate the expectation value ##### 6. Added tsal.total.magnitude to estimate the total size and number of ##### objects in a population sampled from the tail, per Doug White # Updated version (0.2.1) finished on 6 February 2007 ##### 1. Added optional multiplier argument to tsal.total.magnitude # TODO in the future: ##### 1. Calls to d/p/q/r tsal with q=1 should be turfed to the exponential ##### distribution proper ##### 2. Modify return values of tsal.mle.equation and tsal.mle.direct so that ##### when the latter passes off a job to the former, this is reflected in ##### the ultimate "method" component of the list returned by tsal.fit. ##### 3. Issue a warning when going from one fitting method to another ################################################################# ######## DISTRIBUTION-RELATED FUNCTIONS ######### ################################################################# # Calculate the probability density # Input: vector of data values, distributional parameters, left-censoring # threshold, log flag # Output: vector of (log) densities dtsal <- function(x, shape=1,scale=1, q=tsal.q.from.shape(shape), kappa=tsal.kappa.from.ss(shape,scale), xmin=0, log=FALSE) { # If we have both shape/scale and q/kappa parameters, # the latter over-ride. ss <- tsal.ss.from.qk(q,kappa) shape <- ss[1] scale <- ss[2] # Under censoring, pass off to the tail version if (xmin > 0) { return(dtsal.tail(x, shape, scale, q, kappa, xmin, log)) } z <- 1+x/scale if (log) { d <- -(shape+1)*log(z) + log(shape/scale) } else { d <- (shape/scale)*(z^(-shape-1)) } return(d) } # Calculate the cumulative distribution function # Input: vector of data values, distributional parameters, left-censoring # threshold, usual flags # Output: vector of (log) probabilities ptsal <- function(x, shape=1, scale=1, q=tsal.q.from.shape(shape), kappa=tsal.kappa.from.ss(shape,scale), xmin=0, lower.tail=TRUE, log.p=FALSE) { # If we have both shape/scale and q/kappa parameters, # the latter over-ride. ss <- tsal.ss.from.qk(q,kappa) shape <- ss[1] scale <- ss[2] if (xmin > 0) { return(ptsal.tail(x, shape, scale, q, kappa, xmin, lower.tail, log.p)) } z <- 1+x/scale if ((log.p) && (!lower.tail)) { p <- -shape*log(z) } if ((log.p) && (lower.tail)) { p <- log(1-(z^(-shape))) } if ((!log.p) && (!lower.tail)) { p <- z^(-shape) } if ((!log.p) && (lower.tail)) { p <- 1 - z^(-shape) } return(p) } # Calculate quantiles # Input: vector of p-values, distributional parameters, left-censoring # threshold, usual flags # Output: vector of quantile locations qtsal <- function(p, shape=1, scale=1, q=tsal.q.from.shape(shape), kappa=tsal.kappa.from.ss(shape,scale), xmin=0, lower.tail=TRUE, log.p=FALSE) { # If we have both shape/scale and q/kappa parameters, # the latter over-ride. ss <- tsal.ss.from.qk(q,kappa) shape <- ss[1] scale <- ss[2] if (xmin > 0) { return(qtsal.tail(p, shape, scale, q, kappa, xmin, lower.tail, log.p)) } if (log.p) { p <- exp(p) } if (lower.tail) { p <- 1-p } # The upper quantile function is given by # (scale)(p^{-1/shape} - 1) = x quantiles <- scale*(-1 + (p^(-1/shape))) return(quantiles) } # Generate random variates # Input: integer length, distributional parameters, left-censoring threshold # Output: vector of reals rtsal <- function(n, shape=1, scale=1, q=tsal.q.from.shape(shape), kappa=tsal.kappa.from.ss(shape,scale), xmin=0) { # If we have both shape/scale and q/kappa parameters, the latter over-ride. ss <- tsal.ss.from.qk(q,kappa) shape <- ss[1] scale <- ss[2] if (xmin > 0) { return(rtsal.tail(n, shape, scale, q, kappa, xmin)) } # Apply the transformation method ru <- runif(n) r <- qtsal(ru,shape,scale) return(r) } ################################################################# ######## PARAMETER ESTIMATION FUNCTIONS ######### ################################################################# # Calculate log-likelihood # If a left-censoring threshold is set, and any of the data are below it, # this should produce NA, probably with an error message # Input: Data vector, distributional parameters, left-censoring threshold # Output: log likelihood tsal.loglike <- function(x, shape, scale, q=tsal.q.from.shape(shape), kappa=tsal.kappa.from.ss(shape,scale), xmin=0) { # If we have both shape/scale and q/kappa parameters, the latter over-ride. ss <- tsal.ss.from.qk(q,kappa) shape <- ss[1] scale <- ss[2] l <- sum(dtsal(x,shape=shape,scale=scale,xmin=xmin,log=TRUE)) return(l) } # Fit a q-exponential to data # Wrapper for the actual methods: # tsal.fit.mle.equation (solve maximum likelihood estimating equations); # tsal.fit.mle.direct (numerical likelihood maximization); and # tsal.fit.leastsquares (least-squares curve-fitting to the empirical # distribution); # prettying up the results in all cases. # Users should call this function with the optional method argument, NOT the # subsidiary functions. Allowed values are the extensions of the functions # listed above, plus "curvefit" as an alias to leastsquares. # The likelihood methods are preferred, since the MLE is consistent and # asymptotically unbiased and efficient. The default is to solve the # transcendental estimating equations numerically, as it is by far the fastest # method. Occasionally, however, R's solver gives weird results, with huge # shape/scale parameters, and then direct optimization of the likelihood can be # superior. # The curve-fitting method is disrecommended, and included solely for purposes # of "backwards compatibility" with the practice of the statistical physics # community. # Input: Data, left-censoring threshold, method tag, optional arguments to # optimization routines (if invoked) # Output: A list having as components the shape/scale and q/kappa parameters, # plus some information about the fit and fitting controls tsal.fit <- function(x,xmin=0,method="mle.equation",...) { switch(method, mle.equation = { scale.est <- tsal.mle.equation(x,xmin) }, mle.direct = { scale.est <- tsal.mle.direct(x,xmin,...) }, curvefit = { scale.est <- tsal.fit.leastsquares(x,...) }, leastsquares = { scale.est <- tsal.fit.leastsqures(x,...) }, { cat("Unknown method ", method, "\n"); scale.est <- NA } ) if (!is.na(scale.est)) { shape.est <- tsal.est.shape.from.scale(x,scale.est,xmin=xmin) qk <- tsal.qk.from.ss(shape.est,scale.est) q.est <- qk[1] kappa.est <- qk[2] loglike <- tsal.loglike(x,shape=shape.est,scale=scale.est,xmin=xmin) fit <- list(type="tsal", q=q.est, kappa=kappa.est, shape=shape.est, scale=scale.est, loglike = loglike, n = length(x), xmin=xmin, method=method) } else { fit <- NA } return(fit) } # Fit a q-exponential to data by solving the likelihood estimating equations # Returns ONLY the scale parameter, users should always call tsal.mle, which # does everything else # Uses the basic R routine for finding the roots of a one-variable equation, # which requires an initial bracket on the values; if the defaults don't work, # I have not been clever enough to come up with a good bracketing procedure, # and several crude ones give very bad results, so I give up and pass the # problem to the numerical optimnization routine tsal.mle.direct, which is # slow but always seems to give reasonable-looking results. # Input: Data vector, left-censoring threshold # Output: estimate of scale parameter tsal.mle.equation <- function(x,xmin=0) { n <- length(x) # Set up the function so that it handles the case with xmin possibly != 0 # This form derived from the equation in the paper by multiplying # numerators and denominators by the scale factor, and multiplying # through by n, all in the name of minimizing division g <- function(scale) { shape <- tsal.est.shape.from.scale(x,scale,xmin) z <- x+scale y <- x/z # should work via recycling rule return(n - (shape+1)*sum(y) + n*shape*xmin/(scale+xmin)) } # Initial guesses as to brackets of the root # For positive shape, or q > 1 lower.p <- 1e-12 # Zero doesn't quite work here upper.p <- max(x) # For negative shape, or q < 1 lower.m <- -100*max(x) upper.m <- -max(x)-1 # Has to be strictly less than -max(x) # Check whether the brackets make sense # Don't evaluate the function multiple times g.low.p <- g(lower.p) g.up.p <- g(upper.p) g.low.m <- g(lower.m) g.up.m <- g(upper.m) tried.p <- FALSE tried.m <- FALSE # The brackets make sense, solve if ((sign(g.low.p) != sign(g.up.p)) && (g.up.p!=0) && (g.low.p!=0)) { scale.est.p <- uniroot(g,lower=lower.p,upper=upper.p)$root tried.p <- TRUE } if ((sign(g.low.m) != sign(g.up.m)) && (g.up.m!=0) && (g.low.m!=0)) { scale.est.m <- uniroot(g,lower=lower.m,upper=upper.m)$root tried.m <- TRUE } # If the brackets don't work, don't bother trying to fix them, just optimize # the likelihood if ((!tried.p) && (!tried.m)) { return(tsal.mle.direct(x,xmin)) } # Otherwise, if at least one bracket worked, pick the best if ((tried.p) && (!tried.m)) { scale.est <- scale.est.p } if ((!tried.p) && (tried.m)) { scale.est <- scale.est.m } if ((tried.p) && (tried.m)) { # We don't ever seem to go here, but I'm not sure we can't shape.est.p <- tsal.est.shape.from.scale(x,scale.est.p,xmin) shape.est.m <- tsal.est.shape.from.scale(x,scale.est.m,xmin) lp <- tsal.loglike(x,,shape.est.p,scale.est.p,xmin=xmin) lm <- tsal.loglike(x,shape.est.m,scale.est.m,xmin=xmin) if (lm > lp) { scale.est <- scale.est.m } else { scale.est <- scale.est.p } } # Otherwise, use those brackets to find a root return(scale.est) } # Fit a q-exponential to data by numerically maximizing the likelihood # Returns ONLY the scale parameter; users should always call tsal.mle, which # does everything else # Input: data, left-censoring threshold, optional arguments to maximizer # Output: estimate of scale parameter tsal.mle.direct <- function(x,xmin,...) { l <- function(t) {-tsal.loglike(x,shape=t[1],scale=t[2],xmin=xmin)} # Starting parameter values for positive shape, or q>1 t0.plus <- c(1,max(x)+1) # Starting parameter values for negative shape, or q<1 t0.minus <- c(-1,-max(x)-1) est.plus <- optim(par=t0.plus, fn=l,gr=NULL,...) est.minus <- optim(par=t0.minus, fn=l,gr=NULL,...) scale.est <- est.plus$par[2] if (est.minus$value < est.plus$value) { scale.est <- est.minus$par[2] } return(scale.est) } # Compute MLE of a q-exponential's shape, assuming scale is known # Input: Data vector, scale parameter, left-censoring threshold # Output: Real-valued shape estimate tsal.est.shape.from.scale <- function(x,scale,xmin=0) { n <- length(x) z <- (scale+x)/(scale+xmin) shape.est <- n/sum(log(z)) return(shape.est) } # Compute MLE of a q-exponential's scale, assuming shape is known # Input: Data vector, scale, left-censoring threshold # Output: Real-valued scale estimate tsal.est.scale.from.shape <- function(x,shape,xmin=0) { n <- length(x) # Set up the function so that it handles the case with xmin possible != 0 # This form derived from the equation in the paper by multiplying # numerators and denominators by the scale factor, and multiplying # through by n, all in the name of minimizing division f <- function(scale) { z <- scale+x y <- x/z # should work via recycling rule return(n-(shape+1)*sum(y) + n*shape*xmin/(scale+xmin)) } lower <- 0 upper <- max(x)*(shape+1) scale.est <- uniroot(f,lower,upper)$root return(scale.est) } # Estimate the parameters of a q-exponential distribution by least-squares # curve-fitting to the empirical cumulative distribution function # Returns ONLY the scale parameter, users should always call tsal.mle, which # does everything else # THEORY: # The survival function is ### Pr(X>=x) = (1+x/scale)^(-shape) # therefore ### log(Pr(X>=x)) + shape*log(1+x/scale) = 0 # Writing S_n for the empirical survival function, the least-squares solution # to ### sum_{i}{(log(S_n(x_i)) + shape*log(1+x_i/scale))^2} # COULD be used to estimate the parameters. # This seems to be the current (2007) best practice in statistical physics. # However, it is, or ought to be, well-known that the analogous estimator for # the Pareto distribution is highly inaccurate, and inferior to the maximum # likelihood estimator. The same is true for q-exponential distributions. # This function is only for comparison and "backward compatibility", and I # strongly recommend against using it. # The default optimization algorithm is Nelder-Mead. The search algorithm and # other aspects of the optimization can be changed by passing in optional # arguments. (See the R help on "optim" for details.) # Inputs: Data vector, optional arguments for the optimizer # Outputs: Real-valued estimate of the scale parameter tsal.fit.leastsquares <- function(x, ...) { n <- length(x) Fn <- ecdf(x) k <- knots(Fn) Sn <- 1 + (1/n) - Fn(k) # Avoids the zero value at the last point f <- function(theta) { sum((log(Sn) + theta[1]*log(1+k/theta[2]))^2) } theta_0 <- c(1,1) # Arbitrary, but we've got to start somewhere! scale.est <- optim(par=theta_0,f=f,grad=NULL,...)$par[2] return(scale.est) } # Find biases and standard errors for parameter estimates by parametric # bootstrapping, and simple confidence intervals # Simulate, many times, drawing samples from the estimated distribution, of # the same size as the original data; re-estimate the parameters on the # simulated data. The distribution of the re-estimates around the estimated # parameters is approximately the same as the distribution of the estimate # around the true parameters. # Invokes the estimating-equation MLE, but it would be easy to modify to # use other methods. # Confidence intervals (CI) are calculated for each parameter separately, using # a simple pivotal interval (see, e.g., Wasserman, _All of Statistics_, section # 8.3). Confidence regions for combinations of parameters would be a tedious, # but straightforward, extension. # Inputs: distribution (as a list of the sort produced by tsal.fit), number of # bootstrap replicates, confidence level for confidence intervals, # distributional parameters (over-riding those of the distribution, if # one was given), fitting method (over-riding that used in the original # fit, if one was given), left-censoring threshold (over-riding, etc.) # Outputs: structured list, containing the actual parameter settings used, # the estimated biases, the estimated standard errors, the lower # confidence limits, the upper confidence limits, the sample size, the # number of replicates, the confidence level, and the fitting method. tsal.bootstrap.errors <- function(dist=NULL, reps=500, confidence=0.95, n=if(is.null(dist)) 1 else dist$n, shape=if(is.null(dist)) 1 else dist$shape, scale=if(is.null(dist)) 1 else dist$scale, q = if(is.null(dist)) tsal.q.from.shape(shape) else dist$q, kappa = if(is.null(dist)) tsal.kappa.from.ss(shape,scale) else dist$kappa, method = if(is.null(dist)) mle.equation else dist$method, xmin = if(is.null(dist)) 0 else dist$xmin) { # If we have both shape/scale and q/kappa parameters, the latter over-ride. ss <- tsal.ss.from.qk(q,kappa) shape <- ss[1] scale <- ss[2] theta <- c(shape,scale,q,kappa) delta <- (1-confidence)/2 # probability of being in either tail, for CI CI.probs <- c(delta, 1-delta) # upper and lower probbilities for CI Bootstrap <- replicate(reps, { x<-rtsal(n,shape,scale,xmin=xmin); est <- tsal.fit(x,xmin=xmin,method=method); c(est$shape, est$scale, est$q, est$kappa) }) Bootstrap <- t(Bootstrap) # Make the columns the different variables Bootstrap.err <- t(apply(Bootstrap,1,"-",theta)) # Subtract theta from each # row Bootstrap.bias <- apply(Bootstrap.err, 2, mean) # i.e. means of columns Bootstrap.se <- apply(Bootstrap.err, 2, sd) # std. devs. of columns Bootstrap.quantiles <- apply(Bootstrap.err,2,quantile,CI.probs) # quantiles # corresponding to upper and lower confidence limits, by columns CI.lower <- theta + Bootstrap.quantiles[1,] # lower confidence limits CI.upper <- theta + Bootstrap.quantiles[2,] # upper confidence limits namelist <- c("shape","scale","q","kappa") names(theta) <- namelist names(Bootstrap.bias) <- namelist names(Bootstrap.se) <- namelist names(CI.lower) <- namelist names(CI.upper) <- namelist return(list(originals=theta, bias=Bootstrap.bias, se=Bootstrap.se, confidence.interval.lower=CI.lower, confidence.interval.upper=CI.upper, sample.size=n, bootrap.replicates=reps, confidence=confidence, method=method, xmin=xmin)) } # Calculate the Fisher information matrix, for asymptotic variances and # covariances of the maximum likelihood estimates of shape and scale # First row/column corresponds to shape, second to scale # Convergence to the asymptotic normal distribution can be slow, so for limited # data you should bootstrap # Note that this function ONLY works with the shape-scale parameterization # Inputs: shape, scale, left-censoring threshold # Outputs: a matrix tsal.fisher <- function(shape, scale, xmin=0) { # If xmin > 0, then effectively the scale = scale+xmin (see paper) scale <- scale+xmin shape.shape <- scale^(-2) shape.scale <- -1/(scale*(shape+1)) scale.scale <- shape/(scale^2 * (shape+2)) fisher <- rbind(c(shape.shape,shape.scale),c(shape.scale,scale.scale)) return(as.matrix(fisher)) } # Calculate the expectation value # Input: Distributional parameters # Output: The mean of that distribution tsal.mean <- function(shape, scale, q=tsal.q.from.shape(shape), kappa=tsal.kappa.from.ss(shape,scale)) { # If we have both shape/scale and q/kappa parameters, the latter over-ride. ss <- tsal.ss.from.qk(q,kappa) shape <- ss[1] scale <- ss[2] mu <- scale/(shape-1) return(mu) } # Estimate the total magnitude of a tail-sampled population # Per Doug White's request # Given that we have n samples from the tail of a distribution, i.e., only # values >= xmin were retained, provide an estimate of the total magnitude # (summed values) of the population # Estimate the number of objects, observed and un-observed, as n/pr(X >= xmin) # and then multiply by the mean # Input: Distribution of the type produced by tsal.fit, distributional # parameters (over-riding the distribution if provided), number of # tail samples (over-riding the distribution if provided), xmin (over- # riding the one recorded in the distribution, if provided), # multiplier of size (if the base units of the data are not real units) # Output: List, giving estimated total magnitude and estimated total population # size tsal.total.magnitude <- function(dist=NULL, n=if(is.null(dist)) 1 else dist$n, shape=if(is.null(dist)) 1 else dist$shape, scale=if(is.null(dist)) 1 else dist$scale, q = if(is.null(dist)) tsal.q.from.shape(shape) else dist$q, kappa = if(is.null(dist)) tsal.kappa.from.ss(shape,scale) else dist$kappa, xmin = if(is.null(dist)) 0 else dist$xmin, mult = 1) { # If we have both shape/scale and q/kappa parameters, the latter over-ride. ss <- tsal.ss.from.qk(q,kappa) shape <- ss[1] scale <- ss[2] p.tail <- ptsal(xmin,shape=shape,scale=scale,lower.tail=FALSE) n.total <- n/p.tail mu <- tsal.mean(shape,scale) mag.total <- n.total * mu * mult totals <- list(magnitude.est = mag.total, count.est = n.total) return(totals) } ######################################################################### ######## IMPLEMENTATION-TESTING FUNCTIONS ######### ######################################################################### # Visually check the accuracy of quantile/inverse quantile transformations # For all parameter values and all x, qtsal(ptsal(x)) should = x. # This function displays the relative error in the transformation, which is due # to numerical imprecision. This indicates roughly how far off random variates # generated by the transformation method will be. # If everything is going according to plan, the curve plotted should oscillate # extremely rapidly between positive and negative limits which, while growing, # stay quite small in absolute terms, e.g., on the order of 1e-5 when # x is on the order of 1e9. # Inputs: Lower limit for x, upper limit for x, distributional parameters, # number of points at which to evaluate the function over the domain, # line width, left-censoring threshold, optional graphics parameters # Outputs: None # Side effects: Plot of relative error, [qtsal(ptsal(x)) - x]/x plot.tsal.quantile.transform <- function(from=0, to=1e6, shape=1, scale=1, q=tsal.q.from.shape(shape), kappa=tsal.kappa.from.ss(shape,scale), n=1e5, lwd=0.01, xmin=0, ...) { # If we have both shape/scale and q/kappa parameters, the latter over-ride. ss <- tsal.ss.from.qk(q,kappa) shape <- ss[1] scale <- ss[2] # Sanity-check from argument if (from < xmin) { from <- xmin } f <- function(x) { (qtsal(ptsal(x,shape,scale,xmin=xmin),shape,scale,xmin=xmin) - x)/x } curve(f(x),from=from,to=to,n=n,lwd=lwd,ylab="relative error",...) invisible() } # Likelihood estimation accuracy check # 2*[(Likelihood at estimated parameters) - (likelihood at true parameters)] # should have a chi^2 distribution with 2 degrees of freedom, at least # asymptotically for large samples # Inputs: Number of sample points, number of replicates, distributional # parameters, left-censoring threshold # Outputs: Results of Kolmgorov-Smirnov test against theoretical distribution # Side effects: Plot of the empirical CDF of likelihood ratios, decorated # with the chi^2_2 CDF plot.tsal.LR.distribution <- function(n=100, reps=100, shape=1, scale=1, q=tsal.q.from.shape(shape), kappa=tsal.kappa.from.ss(shape,scale), xmin=0,method="mle.equation",...) { # If we have both shape/scale and q/kappa parameters, the latter over-ride. ss <- tsal.ss.from.qk(q,kappa) shape <- ss[1] scale <- ss[2] LR <- function(r) {tsal.fit(r,xmin=xmin,method=method)$loglike - tsal.loglike(r,shape=shape, scale=scale,xmin=xmin)} LRs <- replicate(reps, LR(rtsal(n,shape,scale,xmin=xmin))) label.main <- "Sample distribution of the log-likelihood ratio" label.y <- "Cumulative probability" label.x <- "Log likelihood ratio" plot(ecdf(2*LRs), main=label.main, xlab=label.x, ylab=label.y,...) curve(pchisq(x,2), col="red", add=TRUE) ks.test(2*LRs, pchisq, 2) } ################################################################# ######## CENSORED DATA FUNCTIONS ######### ################################################################# # Users will have little reason to call these directly, instead of using # the higher-level ones with the optional xmin argument # Calculate the probability density, conditional on being in the right tail # Users should call dtsal with the xmin argument, not this # Outputs NA at values below the threshold # Input: vector of data values, distributional parameters, log flag # Output: vector of (log) densities dtsal.tail <- function(x, shape=1,scale=1, q=tsal.q.from.shape(shape), kappa=tsal.kappa.from.ss(shape,scale), xmin=0, log=FALSE) { # If we have both shape/scale and q/kappa parameters, # the latter over-ride. ss <- tsal.ss.from.qk(q,kappa) shape <- ss[1] scale <- ss[2] # Default to the non-tail version when xmin==0 # but this function should never be called with xmin==0 if(xmin==0) { return(dtsal(n,shape,scale,q,kappa,log)) } z <- 1+x/scale C <- 1/ptsal(xmin,shape,scale,lower.tail=FALSE) C.log <- -ptsal(xmin,shape,scale,lower.tail=FALSE,log.p=TRUE) # Possible later modification: Check whether C is so small that ONLY C.log # should be reliably used. ("slide-rule" subroutine?) if (log) { f <- function(z) { C.log - (shape+1)*log(z) + log(shape/scale) } } else { f <- function(z) {C*(shape/scale)*(z^(-shape-1)) } } d <- ifelse(x < xmin, NA, f(z)) return(d) } # Calculate the cumulative distribution function, conditional on being in the # right tail # Users should call ptsal with the xmin argument, not this # Outputs NA at values below the threshold # xmin argument # Input: vector of data values, distributional parameters, usual flags # Output: vector of (log) probabilities ptsal.tail <- function(x, shape=1, scale=1, q=tsal.q.from.shape(shape), kappa=tsal.kappa.from.ss(shape,scale), xmin=0, lower.tail=TRUE, log.p=FALSE) { # If we have both shape/scale and q/kappa parameters, # the latter over-ride. ss <- tsal.ss.from.qk(q,kappa) shape <- ss[1] scale <- ss[2] # Default to the non-tail version when xmin==0 # but this function should never be called with xmin==0 if(xmin==0) { return(ptsal(n,shape,scale,q,kappa,lower.tail,log.p)) } C <- 1/ptsal(xmin,shape,scale,lower.tail=FALSE) C.log <- -ptsal(xmin,shape,scale,lower.tail=FALSE,log.p=TRUE) z <- 1+x/scale if ((log.p) && (!lower.tail)) { f <- function(z) { C.log -shape*log(z) } } if ((log.p) && (lower.tail)) { f <- function(z) { log(1-(C*z^(-shape))) } } if ((!log.p) && (!lower.tail)) { f <- function(z) { C*z^(-shape) } } if ((!log.p) && (lower.tail)) { f <- function(z) { 1 - C*z^(-shape) } } p <- ifelse(x < xmin, NA, f(z)) return(p) } # Calculate quantiles, conditional on being in the right tail # Users should call qtsal with the xmin argument, not this # The tail-conditional quantile function is related to the usual one by # Q_tail(p) = Q(1 - p S(xmin)) # where Q is the usual quantile function and S is the UNCONDITIONAL # survival function # Input: vector of p-values, distributional parameters, threshold, usual flags # Output: vector of quantile locations qtsal.tail <- function(p, shape=1, scale=1, q=tsal.q.from.shape(shape), kappa=tsal.kappa.from.ss(shape,scale), xmin=0, lower.tail=TRUE, log.p=FALSE) { # If we have both shape/scale and q/kappa parameters, # the latter over-ride. ss <- tsal.ss.from.qk(q,kappa) shape <- ss[1] scale <- ss[2] # Default to the non-tail version when xmin=0 # but this function should never be called with xmin==0 if(xmin==0) { return(qtsal(n,shape,scale,q,kappa,lower.tail,log.p)) } # Transform the probabilities, if necessary if (log.p) { p <- exp(p) } if (lower.tail) { p <- 1-p } # The right-tail-conditional quantile function, Q_tail, is related to the # usual one via # Q_tail(p) = Q(1 - p S(xmin)) # where S is the UNCONDITIONAL survival function # Let's compute that constant; only let's do it's log, because it can be very # small C.log <- ptsal(xmin, shape, scale, lower.tail=FALSE, log.p=TRUE) # Now multiply it by the probabilities p.multiplied <- exp(C.log + log(p)) # This possibly undoes some of the work under log.p above # Now invoke the unconditional quantile function quantiles <- qtsal(1-p.multiplied,shape,scale) return(quantiles) } # Generate random variates from the right tail # Users should call rtsal with the xmin argument, not this # Input: integer length, distributional parameters, threshold # Output: vector of reals rtsal.tail <- function(n, shape=1, scale=1, q=tsal.q.from.shape(shape), kappa=tsal.kappa.from.ss(shape,scale), xmin=0) { # If we have both shape/scale and q/kappa parameters, the latter over-ride. ss <- tsal.ss.from.qk(q,kappa) shape <- ss[1] scale <- ss[2] # Default to the non-tail version when xmin==0 # but this function should never be called with xmin==0 if(xmin==0) { return(rtsal(n,shape,scale)) } # Apply the transformation method ru <- runif(n) r <- qtsal.tail(ru,shape,scale,xmin=xmin) return(r) } ################################################################# ######## PARAMETER CONVERSION FUNCTIONS ######### ################################################################# # Users will generally have little reason to invoke these functions # Calculate generalized-Pareto shape parameter from q parameter # Input: a real value # Output: a real value tsal.shape.from.q <- function(q) { shape <- -1/(1-q) return(shape) } # Calculate generalized-Pareto scale parameter from q and kappa parameters # Input: two real values # Output: a real value tsal.scale.from.qk <- function(q,kappa) { shape <- tsal.shape.from.q(q) scale <- shape*kappa return(scale) } # Calculate q parameter from shape parameter # Input: a real value # Output: a real value tsal.q.from.shape <- function(shape) { q <- 1+1/shape return(q) } # Calculate kappa parameter from shape and scale parameters # Input: two real values # Output: a real value tsal.kappa.from.ss <- function(shape,scale) { kappa <- scale/shape return(kappa) } # Calculate shape & scale parameters from q & kappa parameters # Input: two real values # Output: vector of two real values tsal.ss.from.qk <- function(q,kappa) { ss <- c(tsal.shape.from.q(q),tsal.scale.from.qk(q,kappa)) return(ss) } # Calculate q & kappa parameters from shape & scale parameters # Input: two real values # Output: vector of two real values tsal.qk.from.ss <- function(shape,scale) { qk <- c(tsal.q.from.shape(shape),tsal.kappa.from.ss(shape,scale)) return(qk) }