############################################################################ # Actuarial Modeling in R # CAS Spring Meeting June, 2007 # Jim Guszcza # R code ############################################################################ library(MASS) library(nnet) library(scatterplot3d) set.seed(123) ############################################################################ # Warm-up ############################################################################ #working with vectors a <- 1:3 b <- seq(2,6,by=2) a; b #use ";" to separate two commands a+b; b+a; a-b; b-a #elementary vector arithmetic a*b; a/b; b/a #more arithmetic a**b; a^b #two ways of exponentiating a %*% b #the dot product t(a) %*% b #the dot product length(a); length(b) c <- 5*a + b c; length(c) objects() rm(a,b,c) objects() #working with matricies a <- 1:12 b <- matrix(a, ncol=3) a;b length(a); length(b) dim(a) #a is a vector - therefore "dimensonless" dim(b) #b is a 4-by-3 matrix b[2,] b[,2] ############################################################################ # Example 0 # MLE fit Gamma data ############################################################################ #simultate data from Gamma distribution x <- rgamma(10000, shape=5, rate = .1) summary(x) truehist(x) lines(density(x), col="red", lwd=2) title("Gamma(5, 0.1) distribution") #MLE estimate of gamma parameters fitdistr(x, "gamma") ?fitdistr #on-line manual fitdistr #R is open source! ############################################################################ # Example 1 # Fitting Loss Distribution # # This example will use loss distribution data # from the COTOR Round 2 challenge # (available on the CAS web site) ############################################################################ #dataset 1: losses from CA reserving project #dataset 2: losses from COTOR Round 2 Challenge cotor <- read.csv("cotor.txt", header = FALSE, sep = ",") loss <- cotor[,1] dim(loss); length(loss) truehist(loss) #try taking loss of loss logloss <- log(loss) truehist(logloss) fitdistr(logloss, "gamma") #add the fit log-gamma to the histogram plot fit <- fitdistr(logloss, "gamma") sshape <- fit$estimate[1] rrate <- fit$estimate[2] sshape; rrate lines(density(rgamma(5000,sshape,rrate)), col="red", lwd=2) #add kernel density estimate of the empirical distribution to plot lines(density(logloss), col="blue", lwd=2) title("logloss histogram with kernel density estimate") #start model disgnostics here par(mfrow=c(2,2)) truehist(logloss) lines(density(logloss), col="blue", lwd=2) title("actual data w/ kernel estimate") #create vector of initial values for MLE estimation #p0 <- c(p=mean(logloss<10),mu1=7,sd1=2,mu2=8,sd2=2) mu <- mean(logloss); sd <- sd(logloss) p0 <- c(p=mean(logloss<1),mu1=mu,sd1=sd,mu2=mu,sd2=sd) #this is an R function loglik <- function(p,x) { e <- p[1] * dnorm((x - p[2])/p[3])/p[3] + (1-p[1]) * dnorm((x - p[4])/p[5])/p[5] if(any(e <= 0)) Inf else -sum(log(e)) } #do MLE computation parm <- optim(p0, loglik, x=logloss)$par parm p <- parm[1];mu1 <- parm[2];sd1 <- parm[3];mu2 <- parm[4];sd2 <- parm[5] #pull 10000 draws from the modeled distribution a <- rnorm(p*10000,mean=mu1,sd=sd1) b <- rnorm((1-p)*10000,mean=mu2,sd=sd2) modeled <- c(a,b) truehist(modeled) lines(density(modeled), col="red", lwd=2) title("modeled loss distribution") #superimpose modeled distribution on the actual data truehist(logloss) lines(density(modeled), col="red", lwd=2) title("actual data w/ modeled dist") #diagnostic plot of modeled distribution qqplot(modeled,logloss) abline(0,1, col="blue", lwd=2) title("QQ plot") ############################################################################ # Example 2 # Claim Duration data - non-linear predictive modeling # # Note: this data was simulated by Jim Guszcza # purely for illustrative purposes. # The relationships between the variables are # not designed to be realistic. ############################################################################ dat <- read.csv("claim_duration.txt", header = TRUE, sep = ",") summary(dat); dim(dat) dat[1:10,] cor(dat) #univariate plots par(mfrow=c(2,2)) truehist(dat$age) truehist(dat$dist) truehist(dat$dur) #bi-variate plots pairs(dat) age <- dat$age dist <- dat$dist dur <- dat$dur #tri-variate plot scatterplot3d(age, dist, dur, highlight.3d=TRUE) title("actual data") #fit neural network model set.seed(123) n1 <<- nnet(dur~age+dist, data=dat ,linout=T , size=10, rang=0.1, decay=.5, maxit=400) yhat.n <- predict(n1,dat)[,1] scatterplot3d(dat$age,dat$dist,yhat.n, color="blue") title("neural net predictions") r1 <<- lm(dur~ age + I(age^2) + dist, data=dat) yhat.r <- predict(r1) scatterplot3d(dat$age,dat$dist,yhat.r, color="green") title("regression predictions") #compare the R^2 of both models y <- dat$dur cor(y,yhat.r)^2 cor(y,yhat.n)^2 #compare goodness of fit par(mfrow=c(2,2)) scatterplot3d(age, dist, dur, highlight.3d=TRUE); title("actual") scatterplot3d(dat$age,dat$dist,yhat.n, color="blue"); title("nnet") scatterplot3d(dat$age,dat$dist,yhat.r, color="green"); title("regression") #model diagnostics - compare Regression and Neural Net par(mfrow=c(2,2)) plot(y,yhat.r, ylim=c(0,200)) abline(0,1,col="blue",lw=2) err <- y-yhat.r plot(yhat.r,err,xlim=c(0,200), ylim=c(-100,100)) abline(h=0,col="green",lw=2) mtext("Multiple Regression Model" ,side=3,outer=T,line=-2,col="blue",cex=1.5,font=3) plot(y,yhat.n, ylim=c(0,200)) abline(0,1,col="blue",lw=2) err <- y-yhat.n plot(yhat.n,err,xlim=c(0,200), ylim=c(-100,100)) abline(h=0,col="green",lw=2) mtext("Neural Net Model" ,side=3,outer=T,line=-21,col="blue",cex=1.5,font=3)