############################################################################ # Actuarial Modeling in R # CAS Predictive Modeling Seminar, October 2007 # Author: Jim Guszcza # Sample R code ############################################################################ library(MASS) library(nnet) library(scatterplot3d) library(splines) set.seed(123) ############################################################################ # Warm-up ############################################################################ #working with vectors a <- 1:3 b <- seq(2,6,by=2) a; b #use ";" to separate two commands a+b; b+a; a-b; b-a #elementary vector arithmetic a*b; a/b; b/a #more arithmetic a**b; a^b #two ways of exponentiating a %*% b #the dot product t(a) %*% b #the dot product length(a); length(b) c <- 5*a + b c; length(c) objects() rm(a,b,c) objects() #working with matricies a <- 1:12 b <- matrix(a, ncol=3) a;b length(a); length(b) dim(a) #a is a vector - therefore "dimensonless" dim(b) #b is a 4-by-3 matrix b[2,] b[,2] ############################################################################ # Example 0 # MLE fit Gamma data ############################################################################ #simultate data from Gamma distribution x <- rgamma(10000, shape=5, rate = .1) summary(x) truehist(x) lines(density(x), col="red", lwd=2) title("Gamma(5, 0.1) distribution") #MLE estimate of gamma parameters fitdistr(x, "gamma") ?fitdistr #on-line manual fitdistr #R is open source! ############################################################################ # Example 1 # Fitting Loss Distribution to COTOR challenge data # http://www.casact.org/cotor/index.cfm?fa=round2 ############################################################################ #dataset: losses from COTOR Round 2 Challenge cotor <- read.csv("cotor.txt", header = FALSE, sep = ",") loss <- cotor[,1] truehist(loss) logloss <- log(loss) truehist(logloss) fitdistr(logloss, "gamma") #add the fit log-gamma to the histogram plot fit <- fitdistr(logloss, "gamma") sshape <- fit$estimate[1] rrate <- fit$estimate[2] sshape; rrate lines(density(rgamma(5000,sshape,rrate)), col="red", lwd=2) #add kernel density estimate of the empirical distribution to plot lines(density(logloss), col="blue", lwd=2) title("logloss histogram with kernel density estimate") #start model disgnostics here par(mfrow=c(2,2)) truehist(logloss) lines(density(logloss), col="blue", lwd=2) title("actual data w/ kernel estimate") #create vector of initial values for MLE estimation #p0 <- c(p=mean(logloss<10),mu1=7,sd1=2,mu2=8,sd2=2) mu <- mean(logloss); sd <- sd(logloss) p0 <- c(p=mean(logloss<1),mu1=mu,sd1=sd,mu2=mu,sd2=sd) #this is an R function loglik <- function(p,x) { e <- p[1] * dnorm((x - p[2])/p[3])/p[3] + (1-p[1]) * dnorm((x - p[4])/p[5])/p[5] if(any(e <= 0)) Inf else -sum(log(e)) } #do MLE computation parm <- optim(p0, loglik, x=logloss)$par parm p <- parm[1];mu1 <- parm[2];sd1 <- parm[3];mu2 <- parm[4];sd2 <- parm[5] #pull 10000 draws from the modeled distribution a <- rnorm(p*10000,mean=mu1,sd=sd1) b <- rnorm((1-p)*10000,mean=mu2,sd=sd2) modeled <- c(a,b) truehist(modeled) lines(density(modeled), col="red", lwd=2) title("modeled loss distribution") #superimpose modeled distribution on the actual data truehist(logloss) lines(density(modeled), col="red", lwd=2) title("actual data w/ modeled dist") #diagnostic plot of modeled distribution qqplot(modeled,logloss) abline(0,1, col="blue", lwd=2) title("QQ plot") ############################################################################ # Example 2 # fitting a non-linear pattern ############################################################################ #simulate sine curve data set.seed(123) nnum <- 1000 x <- runif(nnum,min=1,max=100) x <- round(x) x <- sort(x) e <- rnorm(nnum,mean=0,sd=.4) #random error term y <- sin(2*pi*x/100) + e #data(gam.data) #dd <- gam.data[order(gam.data$x),] #y <- dd$y; x <- dd$x #visualize data; discern patterns plot(x,y,type="p",col="blue") lines(lowess(y~x, f=2/3), col="red", lwd=2) # refine the lowess window plot(x,y,type="p",col="lightgrey") lines(lowess(y~x, f=2/3), col="green", lwd=2) lines(lowess(y~x, f=1/3), col="navy", lwd=2) lines(lowess(y~x, f=1/10), col="magenta", lwd=2) lines(lowess(y~x, f=1/50), col="yellow3", lwd=2) legend("topright",c("f=2/3","f=1/3","f=1/10","f=1/50"), col=c("green","navy","magenta","yellow3"),lwd=c(2,2,2,2)) title(main="locally weighted regressions", col.main="blue", cex.main=1.5, font.main=3) #fit simple linear regression to the data r1 <- lm(y~x) summary(r1) yhat.r1 <- predict(r1) #fit spline regression model basis <- ns(x, df=3) x1 <- basis[,1] x2 <- basis[,2] x3 <- basis[,3] par(mfrow=c(2,2)) plot(x1~x); plot(x2~x);plot(x3~x) r2 <- lm(y ~ x1 +x2 + x3) summary(r2) yhat.r2 <- predict(r2) plot(x,y,type="p",col="lightgrey") lines(lowess(y~x, f=1/3), col="navy", lwd=2) #lowes lines(x,yhat.r1,type="l",col="red",lwd=2,lty=2) #simple regression lines(x,yhat.r2,type="l",col="violet",lwd=2) #spline regression legend("topright",c("lowess","regression","spline"), col=c("navy","red","violet","steelblue"), lwd=c(2,2,2),lty=c(1,2,1)) title(main="non-linear curve-fitting problem", col.main="blue", cex.main=1.5, font.main=3) # compare R^2 of different models cor(y,yhat.r1)^2 cor(y,yhat.r2)^2 ############################################################################ # Example 3 # Claim Duration data - non-linear predictive modeling # Note - this is simulated data ############################################################################ dat <- read.csv("claim_duration.txt", header = TRUE, sep = ",") summary(dat); dim(dat) dat[1:10,] cor(dat) #univariate plots par(mfrow=c(2,2)) truehist(dat$age) truehist(dat$dist) truehist(dat$dur) #bi-variate plots pairs(dat) attach(dat) #tri-variate plot par(mfrow=c(1,1)) scatterplot3d(age, dist, dur, highlight.3d=TRUE) title("actual data") scatterplot3d(age, dist, dur, highlight.3d=TRUE, angle=130) detach(dat) #investigate relationship between distance, duration dat <- dat[order(dat$dist),] plot(dur ~ dist, data=dat, main="Distance, Duration Relationship") lines(lowess(dat$dur~dat$dist, f=2/3), col="blue", lwd=2) yhat.a <- predict(lm(dur~dist + I(dist^2),data=dat)) yhat.b <- predict(lm(dur~dist,data=dat)) yhat.c <- predict(lm(dur~ns(dist,df=2),data=dat)) lines(dat$dist,yhat.a, col="green", type="l", lwd=2) lines(dat$dist,yhat.b, col="red", type="l", lwd=1) lines(dat$dist,yhat.c, col="violet", type="l", lwd=2) legend(20, 250, c("lowess","linear","quadratic","spline"), lwd= c(2,1,2,2), col =c("blue","red","green","violet")) #investigate relationship between age, duration dat <- dat[order(dat$age),] plot(dur ~ age, data=dat, main="Age, Duration Relationship") lines(lowess(dat$dur~dat$age, f=1/3), col="blue", lwd=2) yhat.a <- predict(lm(dur~age + I(age^2),data=dat)) yhat.b <- predict(lm(dur~age,data=dat)) yhat.c <- predict(lm(dur~ns(age,df=2),data=dat)) lines(dat$age,yhat.a, col="green", type="l", lwd=2) lines(dat$age,yhat.b, col="red", type="l", lwd=1) lines(dat$age,yhat.c, col="violet", type="l", lwd=2) legend(20, 250, c("lowess","linear","quadratic","spline"), lwd= c(2,1,2,2), col =c("blue","red","green","violet")) #fit neural network model set.seed(123) n1 <<- nnet(dur~age+dist, data=dat ,linout=T , size=10, rang=0.1, decay=.5, maxit=400) yhat.n <- predict(n1,dat)[,1] scatterplot3d(dat$age,dat$dist,yhat.n, color="blue") title("neural net predictions") r1 <<- lm(dur~ age + I(age^2) + dist, data=dat) yhat.r <- predict(r1) scatterplot3d(dat$age,dat$dist,yhat.r, color="green") title("regression predictions") #compare the R^2 of both models y <- dat$dur cor(y,yhat.r)^2 cor(y,yhat.n)^2 #compare goodness of fit par(mfrow=c(2,2)) scatterplot3d(dat$age, dat$dist, dat$dur, highlight.3d=TRUE); title("actual") scatterplot3d(dat$age,dat$dist,yhat.n, color="blue"); title("nnet") scatterplot3d(dat$age,dat$dist,yhat.r, color="green"); title("regression") dat2 <- cbind(dat,yhat.r, yhat.n) pairs(dat2) #model diagnostics - compare Regression and Neural Net par(mfrow=c(2,2)) plot(y,yhat.r, ylim=c(0,200)) abline(0,1,col="blue",lw=2) err <- y-yhat.r plot(yhat.r,err,xlim=c(0,200), ylim=c(-100,100)) abline(h=0,col="green",lw=2) mtext("Multiple Regression Model" ,side=3,outer=T,line=-2,col="blue",cex=1.5,font=3) plot(y,yhat.n, ylim=c(0,200)) abline(0,1,col="blue",lw=2) err <- y-yhat.n plot(yhat.n,err,xlim=c(0,200), ylim=c(-100,100)) abline(h=0,col="green",lw=2) mtext("Neural Net Model" ,side=3,outer=T,line=-21,col="blue",cex=1.5,font=3)