############################################################################
#	Actuarial Modeling in R
#	CAS Spring Meeting June, 2007
#	Jim Guszcza
#	R code
############################################################################
library(MASS)
library(nnet)
library(scatterplot3d)
set.seed(123)

############################################################################
#	Warm-up
############################################################################
#working with vectors
a <- 1:3
b <- seq(2,6,by=2)
a; b   			#use ";" to separate two commands
a+b; b+a; a-b; b-a	#elementary vector arithmetic
a*b; a/b; b/a		#more arithmetic
a**b; a^b			#two ways of exponentiating
a %*% b			#the dot product
t(a) %*% b			#the dot product
length(a); length(b)
c <- 5*a + b
c; length(c)
objects()
rm(a,b,c)
objects()

#working with matricies
a <- 1:12
b <- matrix(a, ncol=3)
a;b
length(a); length(b)
dim(a)		#a is a vector - therefore "dimensonless"
dim(b)		#b is a 4-by-3 matrix
b[2,]
b[,2]




############################################################################
#	Example 0
#	MLE fit Gamma data
############################################################################
#simultate data from Gamma distribution
x <- rgamma(10000, shape=5, rate = .1)
summary(x)
truehist(x)
lines(density(x), col="red", lwd=2)
title("Gamma(5, 0.1) distribution")

#MLE estimate of gamma parameters
fitdistr(x, "gamma")

?fitdistr		#on-line manual
fitdistr		#R is open source!


############################################################################
#	Example 1
#	Fitting Loss Distribution
#
#	This example will use loss distribution data
#	from the COTOR Round 2 challenge
#	(available on the CAS web site)
############################################################################
#dataset 1:  losses from CA reserving project

#dataset 2:  losses from COTOR Round 2 Challenge
cotor <- read.csv("cotor.txt", header = FALSE, sep = ",")
loss <- cotor[,1]
dim(loss); length(loss)
truehist(loss)

#try taking loss of loss
logloss <- log(loss)
truehist(logloss)
fitdistr(logloss, "gamma")

#add the fit log-gamma to the histogram plot
fit <- fitdistr(logloss, "gamma")
sshape <- fit$estimate[1]
rrate <- fit$estimate[2]
sshape; rrate
lines(density(rgamma(5000,sshape,rrate)), col="red", lwd=2)

#add kernel density estimate of the empirical distribution to plot
lines(density(logloss), col="blue", lwd=2)
title("logloss histogram with kernel density estimate")

#start model disgnostics here
par(mfrow=c(2,2))
truehist(logloss)
lines(density(logloss), col="blue", lwd=2)
title("actual data w/ kernel estimate")


#create vector of initial values for MLE estimation
#p0 <- c(p=mean(logloss<10),mu1=7,sd1=2,mu2=8,sd2=2)
mu <- mean(logloss); sd <- sd(logloss)
p0 <- c(p=mean(logloss<1),mu1=mu,sd1=sd,mu2=mu,sd2=sd)

#this is an R function
loglik <- function(p,x)
{
   e <- p[1] * dnorm((x - p[2])/p[3])/p[3] +
          (1-p[1]) * dnorm((x - p[4])/p[5])/p[5]
   if(any(e <= 0)) Inf else -sum(log(e))
}

	
#do MLE computation
parm <- optim(p0, loglik, x=logloss)$par
parm
p  <- parm[1];mu1 <- parm[2];sd1 <- parm[3];mu2 <- parm[4];sd2 <- parm[5]

#pull 10000 draws from the modeled distribution
a <- rnorm(p*10000,mean=mu1,sd=sd1) 
b <- rnorm((1-p)*10000,mean=mu2,sd=sd2)
modeled <- c(a,b)
truehist(modeled)
lines(density(modeled), col="red", lwd=2)
title("modeled loss distribution")

#superimpose modeled distribution on the actual data
truehist(logloss)
lines(density(modeled), col="red", lwd=2)
title("actual data w/ modeled dist")

#diagnostic plot of modeled distribution
qqplot(modeled,logloss)
abline(0,1, col="blue", lwd=2)
title("QQ plot")


############################################################################
#	Example 2
#	Claim Duration data - non-linear predictive modeling
#
#	Note:  this data was simulated by Jim Guszcza
#	purely for illustrative purposes.
#	The relationships between the variables are
#	not designed to be realistic.
############################################################################
dat <- read.csv("claim_duration.txt", header = TRUE, sep = ",")
summary(dat); dim(dat)
dat[1:10,]
cor(dat)

#univariate plots
par(mfrow=c(2,2))
truehist(dat$age)
truehist(dat$dist)
truehist(dat$dur)

#bi-variate plots
pairs(dat)

age <- dat$age
dist <- dat$dist
dur <- dat$dur

#tri-variate plot
scatterplot3d(age, dist, dur, highlight.3d=TRUE)
title("actual data")


#fit neural network model
set.seed(123)
n1 <<- nnet(dur~age+dist, data=dat ,linout=T
		, size=10, rang=0.1, decay=.5, maxit=400)

yhat.n <- predict(n1,dat)[,1]
scatterplot3d(dat$age,dat$dist,yhat.n, color="blue")	
title("neural net predictions")

r1 <<- lm(dur~ age + I(age^2) + dist, data=dat)
yhat.r <- predict(r1)
scatterplot3d(dat$age,dat$dist,yhat.r, color="green")	
title("regression predictions")

#compare the R^2 of both models
y <- dat$dur
cor(y,yhat.r)^2
cor(y,yhat.n)^2
	
#compare goodness of fit
par(mfrow=c(2,2))
scatterplot3d(age, dist, dur, highlight.3d=TRUE); title("actual")
scatterplot3d(dat$age,dat$dist,yhat.n, color="blue"); title("nnet")	
scatterplot3d(dat$age,dat$dist,yhat.r, color="green"); title("regression")


#model diagnostics - compare Regression and Neural Net
par(mfrow=c(2,2))
plot(y,yhat.r, ylim=c(0,200))
abline(0,1,col="blue",lw=2)
err <- y-yhat.r
plot(yhat.r,err,xlim=c(0,200), ylim=c(-100,100))
abline(h=0,col="green",lw=2)

mtext("Multiple Regression Model"
	,side=3,outer=T,line=-2,col="blue",cex=1.5,font=3)

plot(y,yhat.n, ylim=c(0,200))
abline(0,1,col="blue",lw=2)
err <- y-yhat.n
plot(yhat.n,err,xlim=c(0,200), ylim=c(-100,100))
abline(h=0,col="green",lw=2)
	
mtext("Neural Net Model"
	,side=3,outer=T,line=-21,col="blue",cex=1.5,font=3)